fs/ext4/super.c at v6.4-rc6

tjh.dev / kernel
fork atom
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork atom
kernel / fs / ext4 / super.c
at v6.4-rc6 7365 lines 208 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/ext4/super.c
   4 *
   5 * Copyright (C) 1992, 1993, 1994, 1995
   6 * Remy Card (card@masi.ibp.fr)
   7 * Laboratoire MASI - Institut Blaise Pascal
   8 * Universite Pierre et Marie Curie (Paris VI)
   9 *
  10 *  from
  11 *
  12 *  linux/fs/minix/inode.c
  13 *
  14 *  Copyright (C) 1991, 1992  Linus Torvalds
  15 *
  16 *  Big-endian to little-endian byte-swapping/bitmaps by
  17 *        David S. Miller (davem@caip.rutgers.edu), 1995
  18 */
  19
  20#include <linux/module.h>
  21#include <linux/string.h>
  22#include <linux/fs.h>
  23#include <linux/time.h>
  24#include <linux/vmalloc.h>
  25#include <linux/slab.h>
  26#include <linux/init.h>
  27#include <linux/blkdev.h>
  28#include <linux/backing-dev.h>
  29#include <linux/parser.h>
  30#include <linux/buffer_head.h>
  31#include <linux/exportfs.h>
  32#include <linux/vfs.h>
  33#include <linux/random.h>
  34#include <linux/mount.h>
  35#include <linux/namei.h>
  36#include <linux/quotaops.h>
  37#include <linux/seq_file.h>
  38#include <linux/ctype.h>
  39#include <linux/log2.h>
  40#include <linux/crc16.h>
  41#include <linux/dax.h>
  42#include <linux/uaccess.h>
  43#include <linux/iversion.h>
  44#include <linux/unicode.h>
  45#include <linux/part_stat.h>
  46#include <linux/kthread.h>
  47#include <linux/freezer.h>
  48#include <linux/fsnotify.h>
  49#include <linux/fs_context.h>
  50#include <linux/fs_parser.h>
  51
  52#include "ext4.h"
  53#include "ext4_extents.h"	/* Needed for trace points definition */
  54#include "ext4_jbd2.h"
  55#include "xattr.h"
  56#include "acl.h"
  57#include "mballoc.h"
  58#include "fsmap.h"
  59
  60#define CREATE_TRACE_POINTS
  61#include <trace/events/ext4.h>
  62
  63static struct ext4_lazy_init *ext4_li_info;
  64static DEFINE_MUTEX(ext4_li_mtx);
  65static struct ratelimit_state ext4_mount_msg_ratelimit;
  66
  67static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
  68			     unsigned long journal_devnum);
  69static int ext4_show_options(struct seq_file *seq, struct dentry *root);
  70static void ext4_update_super(struct super_block *sb);
  71static int ext4_commit_super(struct super_block *sb);
  72static int ext4_mark_recovery_complete(struct super_block *sb,
  73					struct ext4_super_block *es);
  74static int ext4_clear_journal_err(struct super_block *sb,
  75				  struct ext4_super_block *es);
  76static int ext4_sync_fs(struct super_block *sb, int wait);
  77static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
  78static int ext4_unfreeze(struct super_block *sb);
  79static int ext4_freeze(struct super_block *sb);
  80static inline int ext2_feature_set_ok(struct super_block *sb);
  81static inline int ext3_feature_set_ok(struct super_block *sb);
  82static void ext4_destroy_lazyinit_thread(void);
  83static void ext4_unregister_li_request(struct super_block *sb);
  84static void ext4_clear_request_list(void);
  85static struct inode *ext4_get_journal_inode(struct super_block *sb,
  86					    unsigned int journal_inum);
  87static int ext4_validate_options(struct fs_context *fc);
  88static int ext4_check_opt_consistency(struct fs_context *fc,
  89				      struct super_block *sb);
  90static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
  91static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
  92static int ext4_get_tree(struct fs_context *fc);
  93static int ext4_reconfigure(struct fs_context *fc);
  94static void ext4_fc_free(struct fs_context *fc);
  95static int ext4_init_fs_context(struct fs_context *fc);
  96static const struct fs_parameter_spec ext4_param_specs[];
  97
  98/*
  99 * Lock ordering
 100 *
 101 * page fault path:
 102 * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
 103 *   -> page lock -> i_data_sem (rw)
 104 *
 105 * buffered write path:
 106 * sb_start_write -> i_mutex -> mmap_lock
 107 * sb_start_write -> i_mutex -> transaction start -> page lock ->
 108 *   i_data_sem (rw)
 109 *
 110 * truncate:
 111 * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
 112 *   page lock
 113 * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
 114 *   i_data_sem (rw)
 115 *
 116 * direct IO:
 117 * sb_start_write -> i_mutex -> mmap_lock
 118 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
 119 *
 120 * writepages:
 121 * transaction start -> page lock(s) -> i_data_sem (rw)
 122 */
 123
 124static const struct fs_context_operations ext4_context_ops = {
 125	.parse_param	= ext4_parse_param,
 126	.get_tree	= ext4_get_tree,
 127	.reconfigure	= ext4_reconfigure,
 128	.free		= ext4_fc_free,
 129};
 130
 131
 132#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
 133static struct file_system_type ext2_fs_type = {
 134	.owner			= THIS_MODULE,
 135	.name			= "ext2",
 136	.init_fs_context	= ext4_init_fs_context,
 137	.parameters		= ext4_param_specs,
 138	.kill_sb		= kill_block_super,
 139	.fs_flags		= FS_REQUIRES_DEV,
 140};
 141MODULE_ALIAS_FS("ext2");
 142MODULE_ALIAS("ext2");
 143#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
 144#else
 145#define IS_EXT2_SB(sb) (0)
 146#endif
 147
 148
 149static struct file_system_type ext3_fs_type = {
 150	.owner			= THIS_MODULE,
 151	.name			= "ext3",
 152	.init_fs_context	= ext4_init_fs_context,
 153	.parameters		= ext4_param_specs,
 154	.kill_sb		= kill_block_super,
 155	.fs_flags		= FS_REQUIRES_DEV,
 156};
 157MODULE_ALIAS_FS("ext3");
 158MODULE_ALIAS("ext3");
 159#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
 160
 161
 162static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
 163				  bh_end_io_t *end_io)
 164{
 165	/*
 166	 * buffer's verified bit is no longer valid after reading from
 167	 * disk again due to write out error, clear it to make sure we
 168	 * recheck the buffer contents.
 169	 */
 170	clear_buffer_verified(bh);
 171
 172	bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
 173	get_bh(bh);
 174	submit_bh(REQ_OP_READ | op_flags, bh);
 175}
 176
 177void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
 178			 bh_end_io_t *end_io)
 179{
 180	BUG_ON(!buffer_locked(bh));
 181
 182	if (ext4_buffer_uptodate(bh)) {
 183		unlock_buffer(bh);
 184		return;
 185	}
 186	__ext4_read_bh(bh, op_flags, end_io);
 187}
 188
 189int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io)
 190{
 191	BUG_ON(!buffer_locked(bh));
 192
 193	if (ext4_buffer_uptodate(bh)) {
 194		unlock_buffer(bh);
 195		return 0;
 196	}
 197
 198	__ext4_read_bh(bh, op_flags, end_io);
 199
 200	wait_on_buffer(bh);
 201	if (buffer_uptodate(bh))
 202		return 0;
 203	return -EIO;
 204}
 205
 206int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
 207{
 208	lock_buffer(bh);
 209	if (!wait) {
 210		ext4_read_bh_nowait(bh, op_flags, NULL);
 211		return 0;
 212	}
 213	return ext4_read_bh(bh, op_flags, NULL);
 214}
 215
 216/*
 217 * This works like __bread_gfp() except it uses ERR_PTR for error
 218 * returns.  Currently with sb_bread it's impossible to distinguish
 219 * between ENOMEM and EIO situations (since both result in a NULL
 220 * return.
 221 */
 222static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
 223					       sector_t block,
 224					       blk_opf_t op_flags, gfp_t gfp)
 225{
 226	struct buffer_head *bh;
 227	int ret;
 228
 229	bh = sb_getblk_gfp(sb, block, gfp);
 230	if (bh == NULL)
 231		return ERR_PTR(-ENOMEM);
 232	if (ext4_buffer_uptodate(bh))
 233		return bh;
 234
 235	ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
 236	if (ret) {
 237		put_bh(bh);
 238		return ERR_PTR(ret);
 239	}
 240	return bh;
 241}
 242
 243struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
 244				   blk_opf_t op_flags)
 245{
 246	return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
 247}
 248
 249struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
 250					    sector_t block)
 251{
 252	return __ext4_sb_bread_gfp(sb, block, 0, 0);
 253}
 254
 255void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
 256{
 257	struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
 258
 259	if (likely(bh)) {
 260		if (trylock_buffer(bh))
 261			ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
 262		brelse(bh);
 263	}
 264}
 265
 266static int ext4_verify_csum_type(struct super_block *sb,
 267				 struct ext4_super_block *es)
 268{
 269	if (!ext4_has_feature_metadata_csum(sb))
 270		return 1;
 271
 272	return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
 273}
 274
 275__le32 ext4_superblock_csum(struct super_block *sb,
 276			    struct ext4_super_block *es)
 277{
 278	struct ext4_sb_info *sbi = EXT4_SB(sb);
 279	int offset = offsetof(struct ext4_super_block, s_checksum);
 280	__u32 csum;
 281
 282	csum = ext4_chksum(sbi, ~0, (char *)es, offset);
 283
 284	return cpu_to_le32(csum);
 285}
 286
 287static int ext4_superblock_csum_verify(struct super_block *sb,
 288				       struct ext4_super_block *es)
 289{
 290	if (!ext4_has_metadata_csum(sb))
 291		return 1;
 292
 293	return es->s_checksum == ext4_superblock_csum(sb, es);
 294}
 295
 296void ext4_superblock_csum_set(struct super_block *sb)
 297{
 298	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 299
 300	if (!ext4_has_metadata_csum(sb))
 301		return;
 302
 303	es->s_checksum = ext4_superblock_csum(sb, es);
 304}
 305
 306ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 307			       struct ext4_group_desc *bg)
 308{
 309	return le32_to_cpu(bg->bg_block_bitmap_lo) |
 310		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 311		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 312}
 313
 314ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 315			       struct ext4_group_desc *bg)
 316{
 317	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 318		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 319		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 320}
 321
 322ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 323			      struct ext4_group_desc *bg)
 324{
 325	return le32_to_cpu(bg->bg_inode_table_lo) |
 326		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 327		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 328}
 329
 330__u32 ext4_free_group_clusters(struct super_block *sb,
 331			       struct ext4_group_desc *bg)
 332{
 333	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 334		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 335		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 336}
 337
 338__u32 ext4_free_inodes_count(struct super_block *sb,
 339			      struct ext4_group_desc *bg)
 340{
 341	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 342		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 343		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 344}
 345
 346__u32 ext4_used_dirs_count(struct super_block *sb,
 347			      struct ext4_group_desc *bg)
 348{
 349	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 350		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 351		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 352}
 353
 354__u32 ext4_itable_unused_count(struct super_block *sb,
 355			      struct ext4_group_desc *bg)
 356{
 357	return le16_to_cpu(bg->bg_itable_unused_lo) |
 358		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 359		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 360}
 361
 362void ext4_block_bitmap_set(struct super_block *sb,
 363			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 364{
 365	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
 366	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 367		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
 368}
 369
 370void ext4_inode_bitmap_set(struct super_block *sb,
 371			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 372{
 373	bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
 374	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 375		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
 376}
 377
 378void ext4_inode_table_set(struct super_block *sb,
 379			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
 380{
 381	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
 382	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 383		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 384}
 385
 386void ext4_free_group_clusters_set(struct super_block *sb,
 387				  struct ext4_group_desc *bg, __u32 count)
 388{
 389	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
 390	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 391		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
 392}
 393
 394void ext4_free_inodes_set(struct super_block *sb,
 395			  struct ext4_group_desc *bg, __u32 count)
 396{
 397	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
 398	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 399		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
 400}
 401
 402void ext4_used_dirs_set(struct super_block *sb,
 403			  struct ext4_group_desc *bg, __u32 count)
 404{
 405	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
 406	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 407		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
 408}
 409
 410void ext4_itable_unused_set(struct super_block *sb,
 411			  struct ext4_group_desc *bg, __u32 count)
 412{
 413	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
 414	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 415		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 416}
 417
 418static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
 419{
 420	now = clamp_val(now, 0, (1ull << 40) - 1);
 421
 422	*lo = cpu_to_le32(lower_32_bits(now));
 423	*hi = upper_32_bits(now);
 424}
 425
 426static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
 427{
 428	return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
 429}
 430#define ext4_update_tstamp(es, tstamp) \
 431	__ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
 432			     ktime_get_real_seconds())
 433#define ext4_get_tstamp(es, tstamp) \
 434	__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 435
 436/*
 437 * The del_gendisk() function uninitializes the disk-specific data
 438 * structures, including the bdi structure, without telling anyone
 439 * else.  Once this happens, any attempt to call mark_buffer_dirty()
 440 * (for example, by ext4_commit_super), will cause a kernel OOPS.
 441 * This is a kludge to prevent these oops until we can put in a proper
 442 * hook in del_gendisk() to inform the VFS and file system layers.
 443 */
 444static int block_device_ejected(struct super_block *sb)
 445{
 446	struct inode *bd_inode = sb->s_bdev->bd_inode;
 447	struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
 448
 449	return bdi->dev == NULL;
 450}
 451
 452static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 453{
 454	struct super_block		*sb = journal->j_private;
 455	struct ext4_sb_info		*sbi = EXT4_SB(sb);
 456	int				error = is_journal_aborted(journal);
 457	struct ext4_journal_cb_entry	*jce;
 458
 459	BUG_ON(txn->t_state == T_FINISHED);
 460
 461	ext4_process_freed_data(sb, txn->t_tid);
 462
 463	spin_lock(&sbi->s_md_lock);
 464	while (!list_empty(&txn->t_private_list)) {
 465		jce = list_entry(txn->t_private_list.next,
 466				 struct ext4_journal_cb_entry, jce_list);
 467		list_del_init(&jce->jce_list);
 468		spin_unlock(&sbi->s_md_lock);
 469		jce->jce_func(sb, jce, error);
 470		spin_lock(&sbi->s_md_lock);
 471	}
 472	spin_unlock(&sbi->s_md_lock);
 473}
 474
 475/*
 476 * This writepage callback for write_cache_pages()
 477 * takes care of a few cases after page cleaning.
 478 *
 479 * write_cache_pages() already checks for dirty pages
 480 * and calls clear_page_dirty_for_io(), which we want,
 481 * to write protect the pages.
 482 *
 483 * However, we may have to redirty a page (see below.)
 484 */
 485static int ext4_journalled_writepage_callback(struct folio *folio,
 486					      struct writeback_control *wbc,
 487					      void *data)
 488{
 489	transaction_t *transaction = (transaction_t *) data;
 490	struct buffer_head *bh, *head;
 491	struct journal_head *jh;
 492
 493	bh = head = folio_buffers(folio);
 494	do {
 495		/*
 496		 * We have to redirty a page in these cases:
 497		 * 1) If buffer is dirty, it means the page was dirty because it
 498		 * contains a buffer that needs checkpointing. So the dirty bit
 499		 * needs to be preserved so that checkpointing writes the buffer
 500		 * properly.
 501		 * 2) If buffer is not part of the committing transaction
 502		 * (we may have just accidentally come across this buffer because
 503		 * inode range tracking is not exact) or if the currently running
 504		 * transaction already contains this buffer as well, dirty bit
 505		 * needs to be preserved so that the buffer gets writeprotected
 506		 * properly on running transaction's commit.
 507		 */
 508		jh = bh2jh(bh);
 509		if (buffer_dirty(bh) ||
 510		    (jh && (jh->b_transaction != transaction ||
 511			    jh->b_next_transaction))) {
 512			folio_redirty_for_writepage(wbc, folio);
 513			goto out;
 514		}
 515	} while ((bh = bh->b_this_page) != head);
 516
 517out:
 518	return AOP_WRITEPAGE_ACTIVATE;
 519}
 520
 521static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
 522{
 523	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
 524	struct writeback_control wbc = {
 525		.sync_mode =  WB_SYNC_ALL,
 526		.nr_to_write = LONG_MAX,
 527		.range_start = jinode->i_dirty_start,
 528		.range_end = jinode->i_dirty_end,
 529        };
 530
 531	return write_cache_pages(mapping, &wbc,
 532				 ext4_journalled_writepage_callback,
 533				 jinode->i_transaction);
 534}
 535
 536static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
 537{
 538	int ret;
 539
 540	if (ext4_should_journal_data(jinode->i_vfs_inode))
 541		ret = ext4_journalled_submit_inode_data_buffers(jinode);
 542	else
 543		ret = ext4_normal_submit_inode_data_buffers(jinode);
 544	return ret;
 545}
 546
 547static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
 548{
 549	int ret = 0;
 550
 551	if (!ext4_should_journal_data(jinode->i_vfs_inode))
 552		ret = jbd2_journal_finish_inode_data_buffers(jinode);
 553
 554	return ret;
 555}
 556
 557static bool system_going_down(void)
 558{
 559	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
 560		|| system_state == SYSTEM_RESTART;
 561}
 562
 563struct ext4_err_translation {
 564	int code;
 565	int errno;
 566};
 567
 568#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }
 569
 570static struct ext4_err_translation err_translation[] = {
 571	EXT4_ERR_TRANSLATE(EIO),
 572	EXT4_ERR_TRANSLATE(ENOMEM),
 573	EXT4_ERR_TRANSLATE(EFSBADCRC),
 574	EXT4_ERR_TRANSLATE(EFSCORRUPTED),
 575	EXT4_ERR_TRANSLATE(ENOSPC),
 576	EXT4_ERR_TRANSLATE(ENOKEY),
 577	EXT4_ERR_TRANSLATE(EROFS),
 578	EXT4_ERR_TRANSLATE(EFBIG),
 579	EXT4_ERR_TRANSLATE(EEXIST),
 580	EXT4_ERR_TRANSLATE(ERANGE),
 581	EXT4_ERR_TRANSLATE(EOVERFLOW),
 582	EXT4_ERR_TRANSLATE(EBUSY),
 583	EXT4_ERR_TRANSLATE(ENOTDIR),
 584	EXT4_ERR_TRANSLATE(ENOTEMPTY),
 585	EXT4_ERR_TRANSLATE(ESHUTDOWN),
 586	EXT4_ERR_TRANSLATE(EFAULT),
 587};
 588
 589static int ext4_errno_to_code(int errno)
 590{
 591	int i;
 592
 593	for (i = 0; i < ARRAY_SIZE(err_translation); i++)
 594		if (err_translation[i].errno == errno)
 595			return err_translation[i].code;
 596	return EXT4_ERR_UNKNOWN;
 597}
 598
 599static void save_error_info(struct super_block *sb, int error,
 600			    __u32 ino, __u64 block,
 601			    const char *func, unsigned int line)
 602{
 603	struct ext4_sb_info *sbi = EXT4_SB(sb);
 604
 605	/* We default to EFSCORRUPTED error... */
 606	if (error == 0)
 607		error = EFSCORRUPTED;
 608
 609	spin_lock(&sbi->s_error_lock);
 610	sbi->s_add_error_count++;
 611	sbi->s_last_error_code = error;
 612	sbi->s_last_error_line = line;
 613	sbi->s_last_error_ino = ino;
 614	sbi->s_last_error_block = block;
 615	sbi->s_last_error_func = func;
 616	sbi->s_last_error_time = ktime_get_real_seconds();
 617	if (!sbi->s_first_error_time) {
 618		sbi->s_first_error_code = error;
 619		sbi->s_first_error_line = line;
 620		sbi->s_first_error_ino = ino;
 621		sbi->s_first_error_block = block;
 622		sbi->s_first_error_func = func;
 623		sbi->s_first_error_time = sbi->s_last_error_time;
 624	}
 625	spin_unlock(&sbi->s_error_lock);
 626}
 627
 628/* Deal with the reporting of failure conditions on a filesystem such as
 629 * inconsistencies detected or read IO failures.
 630 *
 631 * On ext2, we can store the error state of the filesystem in the
 632 * superblock.  That is not possible on ext4, because we may have other
 633 * write ordering constraints on the superblock which prevent us from
 634 * writing it out straight away; and given that the journal is about to
 635 * be aborted, we can't rely on the current, or future, transactions to
 636 * write out the superblock safely.
 637 *
 638 * We'll just use the jbd2_journal_abort() error code to record an error in
 639 * the journal instead.  On recovery, the journal will complain about
 640 * that error until we've noted it down and cleared it.
 641 *
 642 * If force_ro is set, we unconditionally force the filesystem into an
 643 * ABORT|READONLY state, unless the error response on the fs has been set to
 644 * panic in which case we take the easy way out and panic immediately. This is
 645 * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
 646 * at a critical moment in log management.
 647 */
 648static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
 649			      __u32 ino, __u64 block,
 650			      const char *func, unsigned int line)
 651{
 652	journal_t *journal = EXT4_SB(sb)->s_journal;
 653	bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);
 654
 655	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 656	if (test_opt(sb, WARN_ON_ERROR))
 657		WARN_ON_ONCE(1);
 658
 659	if (!continue_fs && !sb_rdonly(sb)) {
 660		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
 661		if (journal)
 662			jbd2_journal_abort(journal, -EIO);
 663	}
 664
 665	if (!bdev_read_only(sb->s_bdev)) {
 666		save_error_info(sb, error, ino, block, func, line);
 667		/*
 668		 * In case the fs should keep running, we need to writeout
 669		 * superblock through the journal. Due to lock ordering
 670		 * constraints, it may not be safe to do it right here so we
 671		 * defer superblock flushing to a workqueue.
 672		 */
 673		if (continue_fs && journal)
 674			schedule_work(&EXT4_SB(sb)->s_error_work);
 675		else
 676			ext4_commit_super(sb);
 677	}
 678
 679	/*
 680	 * We force ERRORS_RO behavior when system is rebooting. Otherwise we
 681	 * could panic during 'reboot -f' as the underlying device got already
 682	 * disabled.
 683	 */
 684	if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
 685		panic("EXT4-fs (device %s): panic forced after error\n",
 686			sb->s_id);
 687	}
 688
 689	if (sb_rdonly(sb) || continue_fs)
 690		return;
 691
 692	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 693	/*
 694	 * Make sure updated value of ->s_mount_flags will be visible before
 695	 * ->s_flags update
 696	 */
 697	smp_wmb();
 698	sb->s_flags |= SB_RDONLY;
 699}
 700
 701static void flush_stashed_error_work(struct work_struct *work)
 702{
 703	struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
 704						s_error_work);
 705	journal_t *journal = sbi->s_journal;
 706	handle_t *handle;
 707
 708	/*
 709	 * If the journal is still running, we have to write out superblock
 710	 * through the journal to avoid collisions of other journalled sb
 711	 * updates.
 712	 *
 713	 * We use directly jbd2 functions here to avoid recursing back into
 714	 * ext4 error handling code during handling of previous errors.
 715	 */
 716	if (!sb_rdonly(sbi->s_sb) && journal) {
 717		struct buffer_head *sbh = sbi->s_sbh;
 718		handle = jbd2_journal_start(journal, 1);
 719		if (IS_ERR(handle))
 720			goto write_directly;
 721		if (jbd2_journal_get_write_access(handle, sbh)) {
 722			jbd2_journal_stop(handle);
 723			goto write_directly;
 724		}
 725		ext4_update_super(sbi->s_sb);
 726		if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
 727			ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
 728				 "superblock detected");
 729			clear_buffer_write_io_error(sbh);
 730			set_buffer_uptodate(sbh);
 731		}
 732
 733		if (jbd2_journal_dirty_metadata(handle, sbh)) {
 734			jbd2_journal_stop(handle);
 735			goto write_directly;
 736		}
 737		jbd2_journal_stop(handle);
 738		ext4_notify_error_sysfs(sbi);
 739		return;
 740	}
 741write_directly:
 742	/*
 743	 * Write through journal failed. Write sb directly to get error info
 744	 * out and hope for the best.
 745	 */
 746	ext4_commit_super(sbi->s_sb);
 747	ext4_notify_error_sysfs(sbi);
 748}
 749
 750#define ext4_error_ratelimit(sb)					\
 751		___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),	\
 752			     "EXT4-fs error")
 753
 754void __ext4_error(struct super_block *sb, const char *function,
 755		  unsigned int line, bool force_ro, int error, __u64 block,
 756		  const char *fmt, ...)
 757{
 758	struct va_format vaf;
 759	va_list args;
 760
 761	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 762		return;
 763
 764	trace_ext4_error(sb, function, line);
 765	if (ext4_error_ratelimit(sb)) {
 766		va_start(args, fmt);
 767		vaf.fmt = fmt;
 768		vaf.va = &args;
 769		printk(KERN_CRIT
 770		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
 771		       sb->s_id, function, line, current->comm, &vaf);
 772		va_end(args);
 773	}
 774	fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
 775
 776	ext4_handle_error(sb, force_ro, error, 0, block, function, line);
 777}
 778
 779void __ext4_error_inode(struct inode *inode, const char *function,
 780			unsigned int line, ext4_fsblk_t block, int error,
 781			const char *fmt, ...)
 782{
 783	va_list args;
 784	struct va_format vaf;
 785
 786	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 787		return;
 788
 789	trace_ext4_error(inode->i_sb, function, line);
 790	if (ext4_error_ratelimit(inode->i_sb)) {
 791		va_start(args, fmt);
 792		vaf.fmt = fmt;
 793		vaf.va = &args;
 794		if (block)
 795			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 796			       "inode #%lu: block %llu: comm %s: %pV\n",
 797			       inode->i_sb->s_id, function, line, inode->i_ino,
 798			       block, current->comm, &vaf);
 799		else
 800			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 801			       "inode #%lu: comm %s: %pV\n",
 802			       inode->i_sb->s_id, function, line, inode->i_ino,
 803			       current->comm, &vaf);
 804		va_end(args);
 805	}
 806	fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED);
 807
 808	ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
 809			  function, line);
 810}
 811
 812void __ext4_error_file(struct file *file, const char *function,
 813		       unsigned int line, ext4_fsblk_t block,
 814		       const char *fmt, ...)
 815{
 816	va_list args;
 817	struct va_format vaf;
 818	struct inode *inode = file_inode(file);
 819	char pathname[80], *path;
 820
 821	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 822		return;
 823
 824	trace_ext4_error(inode->i_sb, function, line);
 825	if (ext4_error_ratelimit(inode->i_sb)) {
 826		path = file_path(file, pathname, sizeof(pathname));
 827		if (IS_ERR(path))
 828			path = "(unknown)";
 829		va_start(args, fmt);
 830		vaf.fmt = fmt;
 831		vaf.va = &args;
 832		if (block)
 833			printk(KERN_CRIT
 834			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 835			       "block %llu: comm %s: path %s: %pV\n",
 836			       inode->i_sb->s_id, function, line, inode->i_ino,
 837			       block, current->comm, path, &vaf);
 838		else
 839			printk(KERN_CRIT
 840			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 841			       "comm %s: path %s: %pV\n",
 842			       inode->i_sb->s_id, function, line, inode->i_ino,
 843			       current->comm, path, &vaf);
 844		va_end(args);
 845	}
 846	fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED);
 847
 848	ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
 849			  function, line);
 850}
 851
 852const char *ext4_decode_error(struct super_block *sb, int errno,
 853			      char nbuf[16])
 854{
 855	char *errstr = NULL;
 856
 857	switch (errno) {
 858	case -EFSCORRUPTED:
 859		errstr = "Corrupt filesystem";
 860		break;
 861	case -EFSBADCRC:
 862		errstr = "Filesystem failed CRC";
 863		break;
 864	case -EIO:
 865		errstr = "IO failure";
 866		break;
 867	case -ENOMEM:
 868		errstr = "Out of memory";
 869		break;
 870	case -EROFS:
 871		if (!sb || (EXT4_SB(sb)->s_journal &&
 872			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 873			errstr = "Journal has aborted";
 874		else
 875			errstr = "Readonly filesystem";
 876		break;
 877	default:
 878		/* If the caller passed in an extra buffer for unknown
 879		 * errors, textualise them now.  Else we just return
 880		 * NULL. */
 881		if (nbuf) {
 882			/* Check for truncated error codes... */
 883			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
 884				errstr = nbuf;
 885		}
 886		break;
 887	}
 888
 889	return errstr;
 890}
 891
 892/* __ext4_std_error decodes expected errors from journaling functions
 893 * automatically and invokes the appropriate error response.  */
 894
 895void __ext4_std_error(struct super_block *sb, const char *function,
 896		      unsigned int line, int errno)
 897{
 898	char nbuf[16];
 899	const char *errstr;
 900
 901	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 902		return;
 903
 904	/* Special case: if the error is EROFS, and we're not already
 905	 * inside a transaction, then there's really no point in logging
 906	 * an error. */
 907	if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
 908		return;
 909
 910	if (ext4_error_ratelimit(sb)) {
 911		errstr = ext4_decode_error(sb, errno, nbuf);
 912		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
 913		       sb->s_id, function, line, errstr);
 914	}
 915	fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED);
 916
 917	ext4_handle_error(sb, false, -errno, 0, 0, function, line);
 918}
 919
 920void __ext4_msg(struct super_block *sb,
 921		const char *prefix, const char *fmt, ...)
 922{
 923	struct va_format vaf;
 924	va_list args;
 925
 926	if (sb) {
 927		atomic_inc(&EXT4_SB(sb)->s_msg_count);
 928		if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state),
 929				  "EXT4-fs"))
 930			return;
 931	}
 932
 933	va_start(args, fmt);
 934	vaf.fmt = fmt;
 935	vaf.va = &args;
 936	if (sb)
 937		printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
 938	else
 939		printk("%sEXT4-fs: %pV\n", prefix, &vaf);
 940	va_end(args);
 941}
 942
 943static int ext4_warning_ratelimit(struct super_block *sb)
 944{
 945	atomic_inc(&EXT4_SB(sb)->s_warning_count);
 946	return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
 947			    "EXT4-fs warning");
 948}
 949
 950void __ext4_warning(struct super_block *sb, const char *function,
 951		    unsigned int line, const char *fmt, ...)
 952{
 953	struct va_format vaf;
 954	va_list args;
 955
 956	if (!ext4_warning_ratelimit(sb))
 957		return;
 958
 959	va_start(args, fmt);
 960	vaf.fmt = fmt;
 961	vaf.va = &args;
 962	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
 963	       sb->s_id, function, line, &vaf);
 964	va_end(args);
 965}
 966
 967void __ext4_warning_inode(const struct inode *inode, const char *function,
 968			  unsigned int line, const char *fmt, ...)
 969{
 970	struct va_format vaf;
 971	va_list args;
 972
 973	if (!ext4_warning_ratelimit(inode->i_sb))
 974		return;
 975
 976	va_start(args, fmt);
 977	vaf.fmt = fmt;
 978	vaf.va = &args;
 979	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
 980	       "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
 981	       function, line, inode->i_ino, current->comm, &vaf);
 982	va_end(args);
 983}
 984
/*
 * Report an error for block group @grp from a context annotated as holding
 * the group bitlock (see the __releases/__acquires annotations).
 *
 * @function/@line identify the reporting site, @ino and @block (each
 * optional, 0 meaning "not applicable") are added to the message, and
 * @fmt/... carry the description.  Nothing is done once the filesystem has
 * been forcibly shut down.
 *
 * With errors=continue the group lock is never dropped: the error state is
 * recorded and s_error_work is scheduled.  Otherwise the group lock is
 * released around ext4_handle_error() and re-taken before returning.
 */
void __ext4_grp_locked_error(const char *function, unsigned int line,
			     struct super_block *sb, ext4_group_t grp,
			     unsigned long ino, ext4_fsblk_t block,
			     const char *fmt, ...)
__releases(bitlock)
__acquires(bitlock)
{
	struct va_format vaf;
	va_list args;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
		return;

	trace_ext4_error(sb, function, line);
	if (ext4_error_ratelimit(sb)) {
		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* The message is assembled from several KERN_CONT pieces. */
		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
		       sb->s_id, function, line, grp);
		if (ino)
			printk(KERN_CONT "inode %lu: ", ino);
		if (block)
			printk(KERN_CONT "block %llu:",
			       (unsigned long long) block);
		printk(KERN_CONT "%pV\n", &vaf);
		va_end(args);
	}

	if (test_opt(sb, ERRORS_CONT)) {
		if (test_opt(sb, WARN_ON_ERROR))
			WARN_ON_ONCE(1);
		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
		/* Only stash error info when the device can be written. */
		if (!bdev_read_only(sb->s_bdev)) {
			save_error_info(sb, EFSCORRUPTED, ino, block, function,
					line);
			schedule_work(&EXT4_SB(sb)->s_error_work);
		}
		return;
	}
	ext4_unlock_group(sb, grp);
	ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line);
	/*
	 * We only get here in the ERRORS_RO case; relocking the group
	 * may be dangerous, but nothing bad will happen since the
	 * filesystem will have already been marked read/only and the
	 * journal has been aborted.  Note that this function returns
	 * void, so callers cannot use a return value to distinguish
	 * between the ERRORS_CONT and ERRORS_RO cases.
	 */
	ext4_lock_group(sb, grp);
	return;
}
1041
1042void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
1043				     ext4_group_t group,
1044				     unsigned int flags)
1045{
1046	struct ext4_sb_info *sbi = EXT4_SB(sb);
1047	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1048	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
1049	int ret;
1050
1051	if (!grp || !gdp)
1052		return;
1053	if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
1054		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1055					    &grp->bb_state);
1056		if (!ret)
1057			percpu_counter_sub(&sbi->s_freeclusters_counter,
1058					   grp->bb_free);
1059	}
1060
1061	if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
1062		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
1063					    &grp->bb_state);
1064		if (!ret && gdp) {
1065			int count;
1066
1067			count = ext4_free_inodes_count(sb, gdp);
1068			percpu_counter_sub(&sbi->s_freeinodes_counter,
1069					   count);
1070		}
1071	}
1072}
1073
1074void ext4_update_dynamic_rev(struct super_block *sb)
1075{
1076	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
1077
1078	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
1079		return;
1080
1081	ext4_warning(sb,
1082		     "updating to rev %d because of new feature flag, "
1083		     "running e2fsck is recommended",
1084		     EXT4_DYNAMIC_REV);
1085
1086	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
1087	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
1088	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
1089	/* leave es->s_feature_*compat flags alone */
1090	/* es->s_uuid will be set by e2fsck if empty */
1091
1092	/*
1093	 * The rest of the superblock fields should be zero, and if not it
1094	 * means they are likely already in use, so leave them alone.  We
1095	 * can leave it up to e2fsck to clean up any inconsistencies there.
1096	 */
1097}
1098
1099/*
1100 * Open the external journal device
1101 */
1102static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
1103{
1104	struct block_device *bdev;
1105
1106	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
1107	if (IS_ERR(bdev))
1108		goto fail;
1109	return bdev;
1110
1111fail:
1112	ext4_msg(sb, KERN_ERR,
1113		 "failed to open journal device unknown-block(%u,%u) %ld",
1114		 MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
1115	return NULL;
1116}
1117
/*
 * Release the journal device.
 *
 * Passes the same mode flags to blkdev_put() that ext4_blkdev_get() passes
 * to blkdev_get_by_dev().
 */
static void ext4_blkdev_put(struct block_device *bdev)
{
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
1125
1126static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
1127{
1128	struct block_device *bdev;
1129	bdev = sbi->s_journal_bdev;
1130	if (bdev) {
1131		ext4_blkdev_put(bdev);
1132		sbi->s_journal_bdev = NULL;
1133	}
1134}
1135
1136static inline struct inode *orphan_list_entry(struct list_head *l)
1137{
1138	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
1139}
1140
1141static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
1142{
1143	struct list_head *l;
1144
1145	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
1146		 le32_to_cpu(sbi->s_es->s_last_orphan));
1147
1148	printk(KERN_ERR "sb_info orphan list:\n");
1149	list_for_each(l, &sbi->s_orphan) {
1150		struct inode *inode = orphan_list_entry(l);
1151		printk(KERN_ERR "  "
1152		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
1153		       inode->i_sb->s_id, inode->i_ino, inode,
1154		       inode->i_mode, inode->i_nlink,
1155		       NEXT_ORPHAN(inode));
1156	}
1157}
1158
1159#ifdef CONFIG_QUOTA
1160static int ext4_quota_off(struct super_block *sb, int type);
1161
1162static inline void ext4_quota_off_umount(struct super_block *sb)
1163{
1164	int type;
1165
1166	/* Use our quota_off function to clear inode flags etc. */
1167	for (type = 0; type < EXT4_MAXQUOTAS; type++)
1168		ext4_quota_off(sb, type);
1169}
1170
1171/*
1172 * This is a helper function which is used in the mount/remount
1173 * codepaths (which holds s_umount) to fetch the quota file name.
1174 */
1175static inline char *get_qf_name(struct super_block *sb,
1176				struct ext4_sb_info *sbi,
1177				int type)
1178{
1179	return rcu_dereference_protected(sbi->s_qf_names[type],
1180					 lockdep_is_held(&sb->s_umount));
1181}
1182#else
/* !CONFIG_QUOTA: there is no quota to turn off at unmount. */
static inline void ext4_quota_off_umount(struct super_block *sb)
{
}
1186#endif
1187
1188static int ext4_percpu_param_init(struct ext4_sb_info *sbi)
1189{
1190	ext4_fsblk_t block;
1191	int err;
1192
1193	block = ext4_count_free_clusters(sbi->s_sb);
1194	ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block));
1195	err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
1196				  GFP_KERNEL);
1197	if (!err) {
1198		unsigned long freei = ext4_count_free_inodes(sbi->s_sb);
1199		sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
1200		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
1201					  GFP_KERNEL);
1202	}
1203	if (!err)
1204		err = percpu_counter_init(&sbi->s_dirs_counter,
1205					  ext4_count_dirs(sbi->s_sb), GFP_KERNEL);
1206	if (!err)
1207		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
1208					  GFP_KERNEL);
1209	if (!err)
1210		err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
1211					  GFP_KERNEL);
1212	if (!err)
1213		err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
1214
1215	if (err)
1216		ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory");
1217
1218	return err;
1219}
1220
1221static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi)
1222{
1223	percpu_counter_destroy(&sbi->s_freeclusters_counter);
1224	percpu_counter_destroy(&sbi->s_freeinodes_counter);
1225	percpu_counter_destroy(&sbi->s_dirs_counter);
1226	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
1227	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
1228	percpu_free_rwsem(&sbi->s_writepages_rwsem);
1229}
1230
1231static void ext4_group_desc_free(struct ext4_sb_info *sbi)
1232{
1233	struct buffer_head **group_desc;
1234	int i;
1235
1236	rcu_read_lock();
1237	group_desc = rcu_dereference(sbi->s_group_desc);
1238	for (i = 0; i < sbi->s_gdb_count; i++)
1239		brelse(group_desc[i]);
1240	kvfree(group_desc);
1241	rcu_read_unlock();
1242}
1243
1244static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
1245{
1246	struct flex_groups **flex_groups;
1247	int i;
1248
1249	rcu_read_lock();
1250	flex_groups = rcu_dereference(sbi->s_flex_groups);
1251	if (flex_groups) {
1252		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
1253			kvfree(flex_groups[i]);
1254		kvfree(flex_groups);
1255	}
1256	rcu_read_unlock();
1257}
1258
/*
 * Tear down all ext4 state attached to @sb and free its ext4_sb_info.
 * Installed as the .put_super method in ext4_sops.
 *
 * The order of the steps below matters; see the individual comments.
 */
static void ext4_put_super(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int aborted = 0;
	int err;

	/*
	 * Unregister sysfs before destroying the jbd2 journal, since the
	 * attr_journal_task attribute could otherwise still be read via
	 * sysfs while sbi->s_journal->j_task is NULL.
	 *
	 * Also unregister sysfs before flushing sbi->s_error_work: a user
	 * may read /proc/fs/ext4/xx/mb_groups during umount, and if
	 * metadata verification fails on that read, error work is queued.
	 * flush_stashed_error_work would then call start_this_handle,
	 * which may trigger a BUG_ON.
	 */
	ext4_unregister_sysfs(sb);

	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
		ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
			 &sb->s_uuid);

	ext4_unregister_li_request(sb);
	ext4_quota_off_umount(sb);

	flush_work(&sbi->s_error_work);
	destroy_workqueue(sbi->rsv_conversion_wq);
	ext4_release_orphan_info(sb);

	if (sbi->s_journal) {
		/* Sample the abort state before the journal goes away. */
		aborted = is_journal_aborted(sbi->s_journal);
		err = jbd2_journal_destroy(sbi->s_journal);
		sbi->s_journal = NULL;
		if ((err < 0) && !aborted) {
			ext4_abort(sb, -err, "Couldn't clean up the journal");
		}
	}

	ext4_es_unregister_shrinker(sbi);
	timer_shutdown_sync(&sbi->s_err_report);
	ext4_release_system_zone(sb);
	ext4_mb_release(sb);
	ext4_ext_release(sb);

	/*
	 * On a writable filesystem whose journal was not aborted, clear the
	 * needs_recovery and orphan_present features and store the mount
	 * state in the superblock; any writable filesystem then gets its
	 * superblock committed.
	 */
	if (!sb_rdonly(sb) && !aborted) {
		ext4_clear_feature_journal_needs_recovery(sb);
		ext4_clear_feature_orphan_present(sb);
		es->s_state = cpu_to_le16(sbi->s_mount_state);
	}
	if (!sb_rdonly(sb))
		ext4_commit_super(sb);

	ext4_group_desc_free(sbi);
	ext4_flex_groups_free(sbi);
	ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
	for (int i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(get_qf_name(sb, sbi, i));
#endif

	/* Debugging code just in case the in-memory inode orphan list
	 * isn't empty.  The on-disk one can be non-empty if we've
	 * detected an error and taken the fs readonly, but the
	 * in-memory list had better be clean by this point. */
	if (!list_empty(&sbi->s_orphan))
		dump_orphan_list(sb, sbi);
	ASSERT(list_empty(&sbi->s_orphan));

	sync_blockdev(sb->s_bdev);
	invalidate_bdev(sb->s_bdev);
	if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
		/*
		 * Invalidate the journal device's buffers.  We don't want them
		 * floating about in memory - the physical journal device may
		 * be hotswapped, and it breaks the `ro-after' testing code.
		 */
		sync_blockdev(sbi->s_journal_bdev);
		invalidate_bdev(sbi->s_journal_bdev);
		ext4_blkdev_remove(sbi);
	}

	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
	sbi->s_ea_inode_cache = NULL;

	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
	sbi->s_ea_block_cache = NULL;

	ext4_stop_mmpd(sbi);

	brelse(sbi->s_sbh);
	sb->s_fs_info = NULL;
	/*
	 * Now that we are completely done shutting down the
	 * superblock, we need to actually destroy the kobject.
	 * The wait below blocks until s_kobj_unregister is completed,
	 * and only then is sbi torn down and freed.
	 */
	kobject_put(&sbi->s_kobj);
	wait_for_completion(&sbi->s_kobj_unregister);
	if (sbi->s_chksum_driver)
		crypto_free_shash(sbi->s_chksum_driver);
	kfree(sbi->s_blockgroup_lock);
	fs_put_dax(sbi->s_daxdev, NULL);
	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
	utf8_unload(sb->s_encoding);
#endif
	kfree(sbi);
}
1367
/* Slab cache for struct ext4_inode_info; created in init_inodecache(). */
static struct kmem_cache *ext4_inode_cachep;

/*
 * Allocate an ext4_inode_info from ext4_inode_cachep and initialise the
 * fields that must be reset for every use of the object (fields set up
 * once per slab object are handled by init_once()).
 *
 * Called inside transaction, so use GFP_NOFS
 *
 * Returns the embedded VFS inode, or NULL if the allocation failed.
 */
static struct inode *ext4_alloc_inode(struct super_block *sb)
{
	struct ext4_inode_info *ei;

	ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
	if (!ei)
		return NULL;

	inode_set_iversion(&ei->vfs_inode, 1);
	ei->i_flags = 0;
	spin_lock_init(&ei->i_raw_lock);
	/* Preallocation state */
	ei->i_prealloc_node = RB_ROOT;
	atomic_set(&ei->i_prealloc_active, 0);
	rwlock_init(&ei->i_prealloc_lock);
	/* Extent status tree and its shrinker bookkeeping */
	ext4_es_init_tree(&ei->i_es_tree);
	rwlock_init(&ei->i_es_lock);
	INIT_LIST_HEAD(&ei->i_es_list);
	ei->i_es_all_nr = 0;
	ei->i_es_shk_nr = 0;
	ei->i_es_shrink_lblk = 0;
	ei->i_reserved_data_blocks = 0;
	spin_lock_init(&(ei->i_block_reservation_lock));
	ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
	ei->i_reserved_quota = 0;
	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif
	ei->jinode = NULL;
	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
	spin_lock_init(&ei->i_completed_io_lock);
	ei->i_sync_tid = 0;
	ei->i_datasync_tid = 0;
	atomic_set(&ei->i_unwritten, 0);
	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
	/* Fast-commit state; init_once() calls ext4_fc_init_inode() too. */
	ext4_fc_init_inode(&ei->vfs_inode);
	mutex_init(&ei->i_fc_lock);
	return &ei->vfs_inode;
}
1411
/*
 * .drop_inode method: take the generic decision first and consult fscrypt
 * only when the generic code would keep the inode.  The final decision is
 * traced and returned.
 */
static int ext4_drop_inode(struct inode *inode)
{
	int drop;

	drop = generic_drop_inode(inode);
	if (drop == 0)
		drop = fscrypt_drop_inode(inode);

	trace_ext4_drop_inode(inode, drop);
	return drop;
}
1422
1423static void ext4_free_in_core_inode(struct inode *inode)
1424{
1425	fscrypt_free_inode(inode);
1426	if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
1427		pr_warn("%s: inode %ld still in fc list",
1428			__func__, inode->i_ino);
1429	}
1430	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
1431}
1432
1433static void ext4_destroy_inode(struct inode *inode)
1434{
1435	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
1436		ext4_msg(inode->i_sb, KERN_ERR,
1437			 "Inode %lu (%p): orphan list check failed!",
1438			 inode->i_ino, EXT4_I(inode));
1439		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
1440				EXT4_I(inode), sizeof(struct ext4_inode_info),
1441				true);
1442		dump_stack();
1443	}
1444
1445	if (EXT4_I(inode)->i_reserved_data_blocks)
1446		ext4_msg(inode->i_sb, KERN_ERR,
1447			 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
1448			 inode->i_ino, EXT4_I(inode),
1449			 EXT4_I(inode)->i_reserved_data_blocks);
1450}
1451
/*
 * Constructor passed to kmem_cache_create_usercopy() for ext4_inode_cachep
 * (see init_inodecache()).  @foo points at the struct ext4_inode_info being
 * constructed.
 */
static void init_once(void *foo)
{
	struct ext4_inode_info *ei = foo;

	INIT_LIST_HEAD(&ei->i_orphan);
	init_rwsem(&ei->xattr_sem);
	init_rwsem(&ei->i_data_sem);
	inode_init_once(&ei->vfs_inode);
	ext4_fc_init_inode(&ei->vfs_inode);
}
1462
1463static int __init init_inodecache(void)
1464{
1465	ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
1466				sizeof(struct ext4_inode_info), 0,
1467				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
1468					SLAB_ACCOUNT),
1469				offsetof(struct ext4_inode_info, i_data),
1470				sizeof_field(struct ext4_inode_info, i_data),
1471				init_once);
1472	if (ext4_inode_cachep == NULL)
1473		return -ENOMEM;
1474	return 0;
1475}
1476
/* Destroy ext4_inode_cachep, the counterpart of init_inodecache(). */
static void destroy_inodecache(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(ext4_inode_cachep);
}
1486
1487void ext4_clear_inode(struct inode *inode)
1488{
1489	ext4_fc_del(inode);
1490	invalidate_inode_buffers(inode);
1491	clear_inode(inode);
1492	ext4_discard_preallocations(inode, 0);
1493	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
1494	dquot_drop(inode);
1495	if (EXT4_I(inode)->jinode) {
1496		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
1497					       EXT4_I(inode)->jinode);
1498		jbd2_free_inode(EXT4_I(inode)->jinode);
1499		EXT4_I(inode)->jinode = NULL;
1500	}
1501	fscrypt_put_encryption_info(inode);
1502	fsverity_cleanup_inode(inode);
1503}
1504
1505static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1506					u64 ino, u32 generation)
1507{
1508	struct inode *inode;
1509
1510	/*
1511	 * Currently we don't know the generation for parent directory, so
1512	 * a generation of 0 means "accept any"
1513	 */
1514	inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
1515	if (IS_ERR(inode))
1516		return ERR_CAST(inode);
1517	if (generation && inode->i_generation != generation) {
1518		iput(inode);
1519		return ERR_PTR(-ESTALE);
1520	}
1521
1522	return inode;
1523}
1524
/*
 * .fh_to_dentry export operation: defers to generic_fh_to_dentry(), with
 * ext4_nfs_get_inode() as the inode lookup callback.
 */
static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    ext4_nfs_get_inode);
}
1531
/*
 * .fh_to_parent export operation: defers to generic_fh_to_parent(), with
 * ext4_nfs_get_inode() as the inode lookup callback.
 */
static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
				    ext4_nfs_get_inode);
}
1538
1539static int ext4_nfs_commit_metadata(struct inode *inode)
1540{
1541	struct writeback_control wbc = {
1542		.sync_mode = WB_SYNC_ALL
1543	};
1544
1545	trace_ext4_nfs_commit_metadata(inode);
1546	return ext4_write_inode(inode, &wbc);
1547}
1548
1549#ifdef CONFIG_QUOTA
/* Quota type names, indexed by quota type; QTYPE2NAME() does the lookup. */
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])

/*
 * Forward declarations of the quota callbacks referenced by the operation
 * tables below.
 */
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
			 const struct path *path);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
			       size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
				const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
			     unsigned int flags);
1566
1567static struct dquot **ext4_get_dquots(struct inode *inode)
1568{
1569	return EXT4_I(inode)->i_dquot;
1570}
1571
/*
 * dquot operations for ext4: ext4-specific hooks where ext4 has its own
 * implementation, generic dquot_* helpers for the rest.
 */
static const struct dquot_operations ext4_quota_operations = {
	.get_reserved_space	= ext4_get_reserved_space,
	.write_dquot		= ext4_write_dquot,
	.acquire_dquot		= ext4_acquire_dquot,
	.release_dquot		= ext4_release_dquot,
	.mark_dirty		= ext4_mark_dquot_dirty,
	.write_info		= ext4_write_info,
	.alloc_dquot		= dquot_alloc,
	.destroy_dquot		= dquot_destroy,
	.get_projid		= ext4_get_projid,
	.get_inode_usage	= ext4_get_inode_usage,
	.get_next_id		= dquot_get_next_id,
};
1585
/*
 * quotactl operations for ext4: only quota_on/quota_off are ext4-specific,
 * everything else uses the generic dquot_* implementations.
 */
static const struct quotactl_ops ext4_qctl_operations = {
	.quota_on	= ext4_quota_on,
	.quota_off	= ext4_quota_off,
	.quota_sync	= dquot_quota_sync,
	.get_state	= dquot_get_state,
	.set_info	= dquot_set_dqinfo,
	.get_dqblk	= dquot_get_dqblk,
	.set_dqblk	= dquot_set_dqblk,
	.get_nextdqblk	= dquot_get_next_dqblk,
};
1596#endif
1597
/*
 * Superblock operations for ext4.  The quota read/write/get_dquots hooks
 * exist only when CONFIG_QUOTA is enabled.
 */
static const struct super_operations ext4_sops = {
	.alloc_inode	= ext4_alloc_inode,
	.free_inode	= ext4_free_in_core_inode,
	.destroy_inode	= ext4_destroy_inode,
	.write_inode	= ext4_write_inode,
	.dirty_inode	= ext4_dirty_inode,
	.drop_inode	= ext4_drop_inode,
	.evict_inode	= ext4_evict_inode,
	.put_super	= ext4_put_super,
	.sync_fs	= ext4_sync_fs,
	.freeze_fs	= ext4_freeze,
	.unfreeze_fs	= ext4_unfreeze,
	.statfs		= ext4_statfs,
	.show_options	= ext4_show_options,
#ifdef CONFIG_QUOTA
	.quota_read	= ext4_quota_read,
	.quota_write	= ext4_quota_write,
	.get_dquots	= ext4_get_dquots,
#endif
};
1618
/* File handle export operations for ext4. */
static const struct export_operations ext4_export_ops = {
	.fh_to_dentry = ext4_fh_to_dentry,
	.fh_to_parent = ext4_fh_to_parent,
	.get_parent = ext4_get_parent,
	.commit_metadata = ext4_nfs_commit_metadata,
};
1625
/*
 * Mount option tokens.  ext4_param_specs maps option names onto these and
 * ext4_mount_opts describes how individual tokens affect the mount flags.
 *
 * Opt_err terminates ext4_mount_opts.  Opt_mblk_io_submit and
 * Opt_nomblk_io_submit are still enumerated although ext4_param_specs maps
 * both option names to Opt_removed.  The fast-commit debug tokens exist
 * only with CONFIG_EXT4_DEBUG.
 */
enum {
	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
	Opt_resgid, Opt_resuid, Opt_sb,
	Opt_nouid32, Opt_debug, Opt_removed,
	Opt_user_xattr, Opt_acl,
	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
	Opt_inlinecrypt,
	Opt_usrjquota, Opt_grpjquota, Opt_quota,
	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
	Opt_usrquota, Opt_grpquota, Opt_prjquota,
	Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
	Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
	Opt_inode_readahead_blks, Opt_journal_ioprio,
	Opt_dioread_nolock, Opt_dioread_lock,
	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
	Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
#ifdef CONFIG_EXT4_DEBUG
	Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};
1654
/* Values accepted by "errors=", mapped to EXT4_MOUNT_ERRORS_* flags. */
static const struct constant_table ext4_param_errors[] = {
	{"continue",	EXT4_MOUNT_ERRORS_CONT},
	{"panic",	EXT4_MOUNT_ERRORS_PANIC},
	{"remount-ro",	EXT4_MOUNT_ERRORS_RO},
	{}
};
1661
/* Values accepted by "data=", mapped to EXT4_MOUNT_*_DATA flags. */
static const struct constant_table ext4_param_data[] = {
	{"journal",	EXT4_MOUNT_JOURNAL_DATA},
	{"ordered",	EXT4_MOUNT_ORDERED_DATA},
	{"writeback",	EXT4_MOUNT_WRITEBACK_DATA},
	{}
};
1668
/*
 * Values accepted by "data_err=".  Unlike the errors=/data= tables these
 * map to Opt_* tokens rather than to mount flags.
 */
static const struct constant_table ext4_param_data_err[] = {
	{"abort",	Opt_data_err_abort},
	{"ignore",	Opt_data_err_ignore},
	{}
};
1674
/* Values accepted by "jqfmt=", mapped to QFMT_* quota format ids. */
static const struct constant_table ext4_param_jqfmt[] = {
	{"vfsold",	QFMT_VFS_OLD},
	{"vfsv0",	QFMT_VFS_V0},
	{"vfsv1",	QFMT_VFS_V1},
	{}
};
1681
/* Values accepted by "dax=", mapped to Opt_dax_* tokens. */
static const struct constant_table ext4_param_dax[] = {
	{"always",	Opt_dax_always},
	{"inode",	Opt_dax_inode},
	{"never",	Opt_dax_never},
	{}
};
1688
/*
 * String parameter that allows empty argument, i.e. a string parameter
 * declared with the fs_param_can_be_empty flag.
 */
#define fsparam_string_empty(NAME, OPT) \
	__fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
1692
/*
 * Mount option specification
 * We don't use fsparam_flag_no because of the way we set the
 * options and the way we show them in _ext4_show_options(). To
 * keep the changes to a minimum, let's keep the negative options
 * separate for now.
 *
 * Several names are listed twice, once as a bare flag and once taking a
 * value (barrier, dax, auto_da_alloc, init_itable, test_dummy_encryption).
 * Options that are accepted but no longer do anything map to Opt_removed.
 * The table ends with an empty entry.
 */
static const struct fs_parameter_spec ext4_param_specs[] = {
	fsparam_flag	("bsddf",		Opt_bsd_df),
	fsparam_flag	("minixdf",		Opt_minix_df),
	fsparam_flag	("grpid",		Opt_grpid),
	fsparam_flag	("bsdgroups",		Opt_grpid),
	fsparam_flag	("nogrpid",		Opt_nogrpid),
	fsparam_flag	("sysvgroups",		Opt_nogrpid),
	fsparam_u32	("resgid",		Opt_resgid),
	fsparam_u32	("resuid",		Opt_resuid),
	fsparam_u32	("sb",			Opt_sb),
	fsparam_enum	("errors",		Opt_errors, ext4_param_errors),
	fsparam_flag	("nouid32",		Opt_nouid32),
	fsparam_flag	("debug",		Opt_debug),
	fsparam_flag	("oldalloc",		Opt_removed),
	fsparam_flag	("orlov",		Opt_removed),
	fsparam_flag	("user_xattr",		Opt_user_xattr),
	fsparam_flag	("acl",			Opt_acl),
	fsparam_flag	("norecovery",		Opt_noload),
	fsparam_flag	("noload",		Opt_noload),
	fsparam_flag	("bh",			Opt_removed),
	fsparam_flag	("nobh",		Opt_removed),
	fsparam_u32	("commit",		Opt_commit),
	fsparam_u32	("min_batch_time",	Opt_min_batch_time),
	fsparam_u32	("max_batch_time",	Opt_max_batch_time),
	fsparam_u32	("journal_dev",		Opt_journal_dev),
	fsparam_bdev	("journal_path",	Opt_journal_path),
	fsparam_flag	("journal_checksum",	Opt_journal_checksum),
	fsparam_flag	("nojournal_checksum",	Opt_nojournal_checksum),
	fsparam_flag	("journal_async_commit",Opt_journal_async_commit),
	fsparam_flag	("abort",		Opt_abort),
	fsparam_enum	("data",		Opt_data, ext4_param_data),
	fsparam_enum	("data_err",		Opt_data_err,
						ext4_param_data_err),
	fsparam_string_empty
			("usrjquota",		Opt_usrjquota),
	fsparam_string_empty
			("grpjquota",		Opt_grpjquota),
	fsparam_enum	("jqfmt",		Opt_jqfmt, ext4_param_jqfmt),
	fsparam_flag	("grpquota",		Opt_grpquota),
	fsparam_flag	("quota",		Opt_quota),
	fsparam_flag	("noquota",		Opt_noquota),
	fsparam_flag	("usrquota",		Opt_usrquota),
	fsparam_flag	("prjquota",		Opt_prjquota),
	fsparam_flag	("barrier",		Opt_barrier),
	fsparam_u32	("barrier",		Opt_barrier),
	fsparam_flag	("nobarrier",		Opt_nobarrier),
	fsparam_flag	("i_version",		Opt_removed),
	fsparam_flag	("dax",			Opt_dax),
	fsparam_enum	("dax",			Opt_dax_type, ext4_param_dax),
	fsparam_u32	("stripe",		Opt_stripe),
	fsparam_flag	("delalloc",		Opt_delalloc),
	fsparam_flag	("nodelalloc",		Opt_nodelalloc),
	fsparam_flag	("warn_on_error",	Opt_warn_on_error),
	fsparam_flag	("nowarn_on_error",	Opt_nowarn_on_error),
	fsparam_u32	("debug_want_extra_isize",
						Opt_debug_want_extra_isize),
	fsparam_flag	("mblk_io_submit",	Opt_removed),
	fsparam_flag	("nomblk_io_submit",	Opt_removed),
	fsparam_flag	("block_validity",	Opt_block_validity),
	fsparam_flag	("noblock_validity",	Opt_noblock_validity),
	fsparam_u32	("inode_readahead_blks",
						Opt_inode_readahead_blks),
	fsparam_u32	("journal_ioprio",	Opt_journal_ioprio),
	fsparam_u32	("auto_da_alloc",	Opt_auto_da_alloc),
	fsparam_flag	("auto_da_alloc",	Opt_auto_da_alloc),
	fsparam_flag	("noauto_da_alloc",	Opt_noauto_da_alloc),
	fsparam_flag	("dioread_nolock",	Opt_dioread_nolock),
	fsparam_flag	("nodioread_nolock",	Opt_dioread_lock),
	fsparam_flag	("dioread_lock",	Opt_dioread_lock),
	fsparam_flag	("discard",		Opt_discard),
	fsparam_flag	("nodiscard",		Opt_nodiscard),
	fsparam_u32	("init_itable",		Opt_init_itable),
	fsparam_flag	("init_itable",		Opt_init_itable),
	fsparam_flag	("noinit_itable",	Opt_noinit_itable),
#ifdef CONFIG_EXT4_DEBUG
	fsparam_flag	("fc_debug_force",	Opt_fc_debug_force),
	fsparam_u32	("fc_debug_max_replay",	Opt_fc_debug_max_replay),
#endif
	fsparam_u32	("max_dir_size_kb",	Opt_max_dir_size_kb),
	fsparam_flag	("test_dummy_encryption",
						Opt_test_dummy_encryption),
	fsparam_string	("test_dummy_encryption",
						Opt_test_dummy_encryption),
	fsparam_flag	("inlinecrypt",		Opt_inlinecrypt),
	fsparam_flag	("nombcache",		Opt_nombcache),
	fsparam_flag	("no_mbcache",		Opt_nombcache),	/* for backward compatibility */
	fsparam_flag	("prefetch_block_bitmaps",
						Opt_removed),
	fsparam_flag	("no_prefetch_block_bitmaps",
						Opt_no_prefetch_block_bitmaps),
	fsparam_s32	("mb_optimize_scan",	Opt_mb_optimize_scan),
	fsparam_string	("check",		Opt_removed),	/* mount option from ext2/3 */
	fsparam_flag	("nocheck",		Opt_removed),	/* mount option from ext2/3 */
	fsparam_flag	("reservation",		Opt_removed),	/* mount option from ext2/3 */
	fsparam_flag	("noreservation",	Opt_removed),	/* mount option from ext2/3 */
	fsparam_u32	("journal",		Opt_removed),	/* mount option from ext2/3 */
	{}
};
1798
/* Default journal I/O priority: best-effort class, level 3. */
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))

/*
 * Flag bits used in the "flags" column of ext4_mount_opts.
 *
 * Without CONFIG_QUOTA both MOPT_Q and MOPT_QFMT collapse to
 * MOPT_NOSUPPORT; with it, MOPT_Q is 0 and MOPT_QFMT is a bit of its own.
 * MOPT_EXT4_ONLY is the combination of MOPT_NO_EXT2 and MOPT_NO_EXT3.
 * NOTE(review): MOPT_2 presumably selects the second mount-option word
 * (it is paired with an EXT4_MOUNT2_* value in the table) - confirm
 * against the code that consumes ext4_mount_opts.
 */
#define MOPT_SET	0x0001
#define MOPT_CLEAR	0x0002
#define MOPT_NOSUPPORT	0x0004
#define MOPT_EXPLICIT	0x0008
#ifdef CONFIG_QUOTA
#define MOPT_Q		0
#define MOPT_QFMT	0x0010
#else
#define MOPT_Q		MOPT_NOSUPPORT
#define MOPT_QFMT	MOPT_NOSUPPORT
#endif
#define MOPT_NO_EXT2	0x0020
#define MOPT_NO_EXT3	0x0040
#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_SKIP	0x0080
#define	MOPT_2		0x0100
1817
/*
 * Per-token description of simple mount options.
 *
 * @token:     Opt_* value from the option enum
 * @mount_opt: EXT4_MOUNT_* (or, with MOPT_2, EXT4_MOUNT2_*) bits the option
 *             acts on; 0 when the option carries no flag bits
 * @flags:     MOPT_* bits saying how the option is applied and restricted
 *
 * The table is terminated by the Opt_err entry.
 */
static const struct mount_opts {
	int	token;
	int	mount_opt;
	int	flags;
} ext4_mount_opts[] = {
	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_SET},
	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
	{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
	{Opt_commit, 0, MOPT_NO_EXT2},
	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
				    EXT4_MOUNT_JOURNAL_CHECKSUM),
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
	{Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2},
	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
	/* Note the inversion: the flag bit means "no auto_da_alloc". */
	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
	{Opt_dax_type, 0, MOPT_EXT4_ONLY},
	{Opt_journal_dev, 0, MOPT_NO_EXT2},
	{Opt_journal_path, 0, MOPT_NO_EXT2},
	{Opt_journal_ioprio, 0, MOPT_NO_EXT2},
	{Opt_data, 0, MOPT_NO_EXT2},
	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
#else
	{Opt_acl, 0, MOPT_NOSUPPORT},
#endif
	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
		       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
							MOPT_CLEAR | MOPT_Q},
	{Opt_usrjquota, 0, MOPT_Q},
	{Opt_grpjquota, 0, MOPT_Q},
	{Opt_jqfmt, 0, MOPT_QFMT},
	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
	{Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
	 MOPT_SET},
#ifdef CONFIG_EXT4_DEBUG
	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
#endif
	{Opt_err, 0, 0}
};
1891
1892#if IS_ENABLED(CONFIG_UNICODE)
1893static const struct ext4_sb_encodings {
1894	__u16 magic;
1895	char *name;
1896	unsigned int version;
1897} ext4_sb_encoding_map[] = {
1898	{EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
1899};
1900
1901static const struct ext4_sb_encodings *
1902ext4_sb_read_encoding(const struct ext4_super_block *es)
1903{
1904	__u16 magic = le16_to_cpu(es->s_encoding);
1905	int i;
1906
1907	for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
1908		if (magic == ext4_sb_encoding_map[i].magic)
1909			return &ext4_sb_encoding_map[i];
1910
1911	return NULL;
1912}
1913#endif
1914
/*
 * Bits for ext4_fs_context.spec.  The option parser ORs one of these in
 * whenever the corresponding option was given, so that later stages can
 * tell "explicitly specified" apart from "left at the zeroed default".
 *
 * The EXT4_SPEC_s_* names deliberately repeat the name of the matching
 * ext4_fs_context member: the APPLY() helper in ext4_apply_options()
 * pastes the member name onto "EXT4_SPEC_" to find the bit.
 *
 * Bit 6 is not assigned.
 * NOTE(review): EXT4_SPEC_SB_BLOCK is not referenced by the parsing code
 * in this part of the file (Opt_sb sets EXT4_SPEC_s_sb_block) - confirm
 * whether it is still needed.
 */
#define EXT4_SPEC_JQUOTA			(1 <<  0)
#define EXT4_SPEC_JQFMT				(1 <<  1)
#define EXT4_SPEC_DATAJ				(1 <<  2)
#define EXT4_SPEC_SB_BLOCK			(1 <<  3)
#define EXT4_SPEC_JOURNAL_DEV			(1 <<  4)
#define EXT4_SPEC_JOURNAL_IOPRIO		(1 <<  5)
#define EXT4_SPEC_s_want_extra_isize		(1 <<  7)
#define EXT4_SPEC_s_max_batch_time		(1 <<  8)
#define EXT4_SPEC_s_min_batch_time		(1 <<  9)
#define EXT4_SPEC_s_inode_readahead_blks	(1 << 10)
#define EXT4_SPEC_s_li_wait_mult		(1 << 11)
#define EXT4_SPEC_s_max_dir_size_kb		(1 << 12)
#define EXT4_SPEC_s_stripe			(1 << 13)
#define EXT4_SPEC_s_resuid			(1 << 14)
#define EXT4_SPEC_s_resgid			(1 << 15)
#define EXT4_SPEC_s_commit_interval		(1 << 16)
#define EXT4_SPEC_s_fc_debug_max_replay		(1 << 17)
#define EXT4_SPEC_s_sb_block			(1 << 18)
#define EXT4_SPEC_mb_optimize_scan		(1 << 19)
1934
/*
 * Per-mount parsing state, stored in fs_context->fs_private.
 *
 * Options are first collected here by ext4_parse_param(), checked by
 * ext4_check_opt_consistency() and only then copied to the superblock
 * by ext4_apply_options().
 *
 * For each vals_s_X / mask_s_X pair, the mask records which bits the
 * parsed options touched and vals holds the value those bits should get.
 */
struct ext4_fs_context {
	/* Journaled quota file names; allocated copies, freed in ext4_fc_free() */
	char		*s_qf_names[EXT4_MAXQUOTAS];
	struct fscrypt_dummy_policy dummy_enc_policy;
	int		s_jquota_fmt;	/* Format of quota to use */
#ifdef CONFIG_EXT4_DEBUG
	int s_fc_debug_max_replay;
#endif
	/* Bit (1 << qtype) set for each quota type whose name was set or cleared */
	unsigned short	qname_spec;
	unsigned long	vals_s_flags;	/* Bits to set in s_flags */
	unsigned long	mask_s_flags;	/* Bits changed in s_flags */
	/* Encoded journal device number (from journal_dev or journal_path) */
	unsigned long	journal_devnum;
	unsigned long	s_commit_interval;	/* stored as HZ * seconds */
	unsigned long	s_stripe;
	unsigned int	s_inode_readahead_blks;
	unsigned int	s_want_extra_isize;
	unsigned int	s_li_wait_mult;
	unsigned int	s_max_dir_size_kb;
	unsigned int	journal_ioprio;	/* built with IOPRIO_PRIO_VALUE() */
	unsigned int	vals_s_mount_opt;
	unsigned int	mask_s_mount_opt;
	unsigned int	vals_s_mount_opt2;
	unsigned int	mask_s_mount_opt2;
	unsigned long	vals_s_mount_flags;
	unsigned long	mask_s_mount_flags;
	unsigned int	opt_flags;	/* MOPT flags */
	unsigned int	spec;		/* EXT4_SPEC_* bits: what was specified */
	u32		s_max_batch_time;
	u32		s_min_batch_time;
	kuid_t		s_resuid;
	kgid_t		s_resgid;
	ext4_fsblk_t	s_sb_block;
};
1967
1968static void ext4_fc_free(struct fs_context *fc)
1969{
1970	struct ext4_fs_context *ctx = fc->fs_private;
1971	int i;
1972
1973	if (!ctx)
1974		return;
1975
1976	for (i = 0; i < EXT4_MAXQUOTAS; i++)
1977		kfree(ctx->s_qf_names[i]);
1978
1979	fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
1980	kfree(ctx);
1981}
1982
1983int ext4_init_fs_context(struct fs_context *fc)
1984{
1985	struct ext4_fs_context *ctx;
1986
1987	ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
1988	if (!ctx)
1989		return -ENOMEM;
1990
1991	fc->fs_private = ctx;
1992	fc->ops = &ext4_context_ops;
1993
1994	return 0;
1995}
1996
1997#ifdef CONFIG_QUOTA
1998/*
1999 * Note the name of the specified quota file.
2000 */
2001static int note_qf_name(struct fs_context *fc, int qtype,
2002		       struct fs_parameter *param)
2003{
2004	struct ext4_fs_context *ctx = fc->fs_private;
2005	char *qname;
2006
2007	if (param->size < 1) {
2008		ext4_msg(NULL, KERN_ERR, "Missing quota name");
2009		return -EINVAL;
2010	}
2011	if (strchr(param->string, '/')) {
2012		ext4_msg(NULL, KERN_ERR,
2013			 "quotafile must be on filesystem root");
2014		return -EINVAL;
2015	}
2016	if (ctx->s_qf_names[qtype]) {
2017		if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) {
2018			ext4_msg(NULL, KERN_ERR,
2019				 "%s quota file already specified",
2020				 QTYPE2NAME(qtype));
2021			return -EINVAL;
2022		}
2023		return 0;
2024	}
2025
2026	qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
2027	if (!qname) {
2028		ext4_msg(NULL, KERN_ERR,
2029			 "Not enough memory for storing quotafile name");
2030		return -ENOMEM;
2031	}
2032	ctx->s_qf_names[qtype] = qname;
2033	ctx->qname_spec |= 1 << qtype;
2034	ctx->spec |= EXT4_SPEC_JQUOTA;
2035	return 0;
2036}
2037
2038/*
2039 * Clear the name of the specified quota file.
2040 */
2041static int unnote_qf_name(struct fs_context *fc, int qtype)
2042{
2043	struct ext4_fs_context *ctx = fc->fs_private;
2044
2045	if (ctx->s_qf_names[qtype])
2046		kfree(ctx->s_qf_names[qtype]);
2047
2048	ctx->s_qf_names[qtype] = NULL;
2049	ctx->qname_spec |= 1 << qtype;
2050	ctx->spec |= EXT4_SPEC_JQUOTA;
2051	return 0;
2052}
2053#endif
2054
2055static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
2056					    struct ext4_fs_context *ctx)
2057{
2058	int err;
2059
2060	if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
2061		ext4_msg(NULL, KERN_WARNING,
2062			 "test_dummy_encryption option not supported");
2063		return -EINVAL;
2064	}
2065	err = fscrypt_parse_test_dummy_encryption(param,
2066						  &ctx->dummy_enc_policy);
2067	if (err == -EINVAL) {
2068		ext4_msg(NULL, KERN_WARNING,
2069			 "Value of option \"%s\" is unrecognized", param->key);
2070	} else if (err == -EEXIST) {
2071		ext4_msg(NULL, KERN_WARNING,
2072			 "Conflicting test_dummy_encryption options");
2073		return -EINVAL;
2074	}
2075	return err;
2076}
2077
/*
 * Generators for small accessors on the vals_s_<name> / mask_s_<name>
 * pairs of struct ext4_fs_context.
 *
 * EXT4_SET_CTX(name) defines ctx_set_<name>(ctx, flag): marks @flag as
 * changed in the mask and turns it on in the values.
 */
#define EXT4_SET_CTX(name)						\
static inline void ctx_set_##name(struct ext4_fs_context *ctx,		\
				  unsigned long flag)			\
{									\
	ctx->mask_s_##name |= flag;					\
	ctx->vals_s_##name |= flag;					\
}

/*
 * EXT4_CLEAR_CTX(name) defines ctx_clear_<name>(ctx, flag): marks @flag
 * as changed in the mask and turns it off in the values.
 */
#define EXT4_CLEAR_CTX(name)						\
static inline void ctx_clear_##name(struct ext4_fs_context *ctx,	\
				    unsigned long flag)			\
{									\
	ctx->mask_s_##name |= flag;					\
	ctx->vals_s_##name &= ~flag;					\
}

/*
 * EXT4_TEST_CTX(name) defines ctx_test_<name>(ctx, flag): returns the
 * bits of @flag currently set in the values (the mask is not consulted).
 */
#define EXT4_TEST_CTX(name)						\
static inline unsigned long						\
ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
{									\
	return (ctx->vals_s_##name & flag);				\
}

/* Instantiate the accessors used by the option parsing code. */
EXT4_SET_CTX(flags); /* set only */
EXT4_SET_CTX(mount_opt);
EXT4_CLEAR_CTX(mount_opt);
EXT4_TEST_CTX(mount_opt);
EXT4_SET_CTX(mount_opt2);
EXT4_CLEAR_CTX(mount_opt2);
EXT4_TEST_CTX(mount_opt2);
2108
2109static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit)
2110{
2111	set_bit(bit, &ctx->mask_s_mount_flags);
2112	set_bit(bit, &ctx->vals_s_mount_flags);
2113}
2114
2115static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
2116{
2117	struct ext4_fs_context *ctx = fc->fs_private;
2118	struct fs_parse_result result;
2119	const struct mount_opts *m;
2120	int is_remount;
2121	kuid_t uid;
2122	kgid_t gid;
2123	int token;
2124
2125	token = fs_parse(fc, ext4_param_specs, param, &result);
2126	if (token < 0)
2127		return token;
2128	is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
2129
2130	for (m = ext4_mount_opts; m->token != Opt_err; m++)
2131		if (token == m->token)
2132			break;
2133
2134	ctx->opt_flags |= m->flags;
2135
2136	if (m->flags & MOPT_EXPLICIT) {
2137		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
2138			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
2139		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
2140			ctx_set_mount_opt2(ctx,
2141				       EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
2142		} else
2143			return -EINVAL;
2144	}
2145
2146	if (m->flags & MOPT_NOSUPPORT) {
2147		ext4_msg(NULL, KERN_ERR, "%s option not supported",
2148			 param->key);
2149		return 0;
2150	}
2151
2152	switch (token) {
2153#ifdef CONFIG_QUOTA
2154	case Opt_usrjquota:
2155		if (!*param->string)
2156			return unnote_qf_name(fc, USRQUOTA);
2157		else
2158			return note_qf_name(fc, USRQUOTA, param);
2159	case Opt_grpjquota:
2160		if (!*param->string)
2161			return unnote_qf_name(fc, GRPQUOTA);
2162		else
2163			return note_qf_name(fc, GRPQUOTA, param);
2164#endif
2165	case Opt_sb:
2166		if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
2167			ext4_msg(NULL, KERN_WARNING,
2168				 "Ignoring %s option on remount", param->key);
2169		} else {
2170			ctx->s_sb_block = result.uint_32;
2171			ctx->spec |= EXT4_SPEC_s_sb_block;
2172		}
2173		return 0;
2174	case Opt_removed:
2175		ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
2176			 param->key);
2177		return 0;
2178	case Opt_abort:
2179		ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
2180		return 0;
2181	case Opt_inlinecrypt:
2182#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
2183		ctx_set_flags(ctx, SB_INLINECRYPT);
2184#else
2185		ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
2186#endif
2187		return 0;
2188	case Opt_errors:
2189		ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
2190		ctx_set_mount_opt(ctx, result.uint_32);
2191		return 0;
2192#ifdef CONFIG_QUOTA
2193	case Opt_jqfmt:
2194		ctx->s_jquota_fmt = result.uint_32;
2195		ctx->spec |= EXT4_SPEC_JQFMT;
2196		return 0;
2197#endif
2198	case Opt_data:
2199		ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
2200		ctx_set_mount_opt(ctx, result.uint_32);
2201		ctx->spec |= EXT4_SPEC_DATAJ;
2202		return 0;
2203	case Opt_commit:
2204		if (result.uint_32 == 0)
2205			result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE;
2206		else if (result.uint_32 > INT_MAX / HZ) {
2207			ext4_msg(NULL, KERN_ERR,
2208				 "Invalid commit interval %d, "
2209				 "must be smaller than %d",
2210				 result.uint_32, INT_MAX / HZ);
2211			return -EINVAL;
2212		}
2213		ctx->s_commit_interval = HZ * result.uint_32;
2214		ctx->spec |= EXT4_SPEC_s_commit_interval;
2215		return 0;
2216	case Opt_debug_want_extra_isize:
2217		if ((result.uint_32 & 1) || (result.uint_32 < 4)) {
2218			ext4_msg(NULL, KERN_ERR,
2219				 "Invalid want_extra_isize %d", result.uint_32);
2220			return -EINVAL;
2221		}
2222		ctx->s_want_extra_isize = result.uint_32;
2223		ctx->spec |= EXT4_SPEC_s_want_extra_isize;
2224		return 0;
2225	case Opt_max_batch_time:
2226		ctx->s_max_batch_time = result.uint_32;
2227		ctx->spec |= EXT4_SPEC_s_max_batch_time;
2228		return 0;
2229	case Opt_min_batch_time:
2230		ctx->s_min_batch_time = result.uint_32;
2231		ctx->spec |= EXT4_SPEC_s_min_batch_time;
2232		return 0;
2233	case Opt_inode_readahead_blks:
2234		if (result.uint_32 &&
2235		    (result.uint_32 > (1 << 30) ||
2236		     !is_power_of_2(result.uint_32))) {
2237			ext4_msg(NULL, KERN_ERR,
2238				 "EXT4-fs: inode_readahead_blks must be "
2239				 "0 or a power of 2 smaller than 2^31");
2240			return -EINVAL;
2241		}
2242		ctx->s_inode_readahead_blks = result.uint_32;
2243		ctx->spec |= EXT4_SPEC_s_inode_readahead_blks;
2244		return 0;
2245	case Opt_init_itable:
2246		ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE);
2247		ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
2248		if (param->type == fs_value_is_string)
2249			ctx->s_li_wait_mult = result.uint_32;
2250		ctx->spec |= EXT4_SPEC_s_li_wait_mult;
2251		return 0;
2252	case Opt_max_dir_size_kb:
2253		ctx->s_max_dir_size_kb = result.uint_32;
2254		ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
2255		return 0;
2256#ifdef CONFIG_EXT4_DEBUG
2257	case Opt_fc_debug_max_replay:
2258		ctx->s_fc_debug_max_replay = result.uint_32;
2259		ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay;
2260		return 0;
2261#endif
2262	case Opt_stripe:
2263		ctx->s_stripe = result.uint_32;
2264		ctx->spec |= EXT4_SPEC_s_stripe;
2265		return 0;
2266	case Opt_resuid:
2267		uid = make_kuid(current_user_ns(), result.uint_32);
2268		if (!uid_valid(uid)) {
2269			ext4_msg(NULL, KERN_ERR, "Invalid uid value %d",
2270				 result.uint_32);
2271			return -EINVAL;
2272		}
2273		ctx->s_resuid = uid;
2274		ctx->spec |= EXT4_SPEC_s_resuid;
2275		return 0;
2276	case Opt_resgid:
2277		gid = make_kgid(current_user_ns(), result.uint_32);
2278		if (!gid_valid(gid)) {
2279			ext4_msg(NULL, KERN_ERR, "Invalid gid value %d",
2280				 result.uint_32);
2281			return -EINVAL;
2282		}
2283		ctx->s_resgid = gid;
2284		ctx->spec |= EXT4_SPEC_s_resgid;
2285		return 0;
2286	case Opt_journal_dev:
2287		if (is_remount) {
2288			ext4_msg(NULL, KERN_ERR,
2289				 "Cannot specify journal on remount");
2290			return -EINVAL;
2291		}
2292		ctx->journal_devnum = result.uint_32;
2293		ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
2294		return 0;
2295	case Opt_journal_path:
2296	{
2297		struct inode *journal_inode;
2298		struct path path;
2299		int error;
2300
2301		if (is_remount) {
2302			ext4_msg(NULL, KERN_ERR,
2303				 "Cannot specify journal on remount");
2304			return -EINVAL;
2305		}
2306
2307		error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
2308		if (error) {
2309			ext4_msg(NULL, KERN_ERR, "error: could not find "
2310				 "journal device path");
2311			return -EINVAL;
2312		}
2313
2314		journal_inode = d_inode(path.dentry);
2315		ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev);
2316		ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
2317		path_put(&path);
2318		return 0;
2319	}
2320	case Opt_journal_ioprio:
2321		if (result.uint_32 > 7) {
2322			ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority"
2323				 " (must be 0-7)");
2324			return -EINVAL;
2325		}
2326		ctx->journal_ioprio =
2327			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32);
2328		ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
2329		return 0;
2330	case Opt_test_dummy_encryption:
2331		return ext4_parse_test_dummy_encryption(param, ctx);
2332	case Opt_dax:
2333	case Opt_dax_type:
2334#ifdef CONFIG_FS_DAX
2335	{
2336		int type = (token == Opt_dax) ?
2337			   Opt_dax : result.uint_32;
2338
2339		switch (type) {
2340		case Opt_dax:
2341		case Opt_dax_always:
2342			ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
2343			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
2344			break;
2345		case Opt_dax_never:
2346			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
2347			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
2348			break;
2349		case Opt_dax_inode:
2350			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
2351			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
2352			/* Strictly for printing options */
2353			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
2354			break;
2355		}
2356		return 0;
2357	}
2358#else
2359		ext4_msg(NULL, KERN_INFO, "dax option not supported");
2360		return -EINVAL;
2361#endif
2362	case Opt_data_err:
2363		if (result.uint_32 == Opt_data_err_abort)
2364			ctx_set_mount_opt(ctx, m->mount_opt);
2365		else if (result.uint_32 == Opt_data_err_ignore)
2366			ctx_clear_mount_opt(ctx, m->mount_opt);
2367		return 0;
2368	case Opt_mb_optimize_scan:
2369		if (result.int_32 == 1) {
2370			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
2371			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
2372		} else if (result.int_32 == 0) {
2373			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
2374			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
2375		} else {
2376			ext4_msg(NULL, KERN_WARNING,
2377				 "mb_optimize_scan should be set to 0 or 1.");
2378			return -EINVAL;
2379		}
2380		return 0;
2381	}
2382
2383	/*
2384	 * At this point we should only be getting options requiring MOPT_SET,
2385	 * or MOPT_CLEAR. Anything else is a bug
2386	 */
2387	if (m->token == Opt_err) {
2388		ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
2389			 param->key);
2390		WARN_ON(1);
2391		return -EINVAL;
2392	}
2393
2394	else {
2395		unsigned int set = 0;
2396
2397		if ((param->type == fs_value_is_flag) ||
2398		    result.uint_32 > 0)
2399			set = 1;
2400
2401		if (m->flags & MOPT_CLEAR)
2402			set = !set;
2403		else if (unlikely(!(m->flags & MOPT_SET))) {
2404			ext4_msg(NULL, KERN_WARNING,
2405				 "buggy handling of option %s",
2406				 param->key);
2407			WARN_ON(1);
2408			return -EINVAL;
2409		}
2410		if (m->flags & MOPT_2) {
2411			if (set != 0)
2412				ctx_set_mount_opt2(ctx, m->mount_opt);
2413			else
2414				ctx_clear_mount_opt2(ctx, m->mount_opt);
2415		} else {
2416			if (set != 0)
2417				ctx_set_mount_opt(ctx, m->mount_opt);
2418			else
2419				ctx_clear_mount_opt(ctx, m->mount_opt);
2420		}
2421	}
2422
2423	return 0;
2424}
2425
2426static int parse_options(struct fs_context *fc, char *options)
2427{
2428	struct fs_parameter param;
2429	int ret;
2430	char *key;
2431
2432	if (!options)
2433		return 0;
2434
2435	while ((key = strsep(&options, ",")) != NULL) {
2436		if (*key) {
2437			size_t v_len = 0;
2438			char *value = strchr(key, '=');
2439
2440			param.type = fs_value_is_flag;
2441			param.string = NULL;
2442
2443			if (value) {
2444				if (value == key)
2445					continue;
2446
2447				*value++ = 0;
2448				v_len = strlen(value);
2449				param.string = kmemdup_nul(value, v_len,
2450							   GFP_KERNEL);
2451				if (!param.string)
2452					return -ENOMEM;
2453				param.type = fs_value_is_string;
2454			}
2455
2456			param.key = key;
2457			param.size = v_len;
2458
2459			ret = ext4_parse_param(fc, &param);
2460			if (param.string)
2461				kfree(param.string);
2462			if (ret < 0)
2463				return ret;
2464		}
2465	}
2466
2467	ret = ext4_validate_options(fc);
2468	if (ret < 0)
2469		return ret;
2470
2471	return 0;
2472}
2473
2474static int parse_apply_sb_mount_options(struct super_block *sb,
2475					struct ext4_fs_context *m_ctx)
2476{
2477	struct ext4_sb_info *sbi = EXT4_SB(sb);
2478	char *s_mount_opts = NULL;
2479	struct ext4_fs_context *s_ctx = NULL;
2480	struct fs_context *fc = NULL;
2481	int ret = -ENOMEM;
2482
2483	if (!sbi->s_es->s_mount_opts[0])
2484		return 0;
2485
2486	s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
2487				sizeof(sbi->s_es->s_mount_opts),
2488				GFP_KERNEL);
2489	if (!s_mount_opts)
2490		return ret;
2491
2492	fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
2493	if (!fc)
2494		goto out_free;
2495
2496	s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
2497	if (!s_ctx)
2498		goto out_free;
2499
2500	fc->fs_private = s_ctx;
2501	fc->s_fs_info = sbi;
2502
2503	ret = parse_options(fc, s_mount_opts);
2504	if (ret < 0)
2505		goto parse_failed;
2506
2507	ret = ext4_check_opt_consistency(fc, sb);
2508	if (ret < 0) {
2509parse_failed:
2510		ext4_msg(sb, KERN_WARNING,
2511			 "failed to parse options in superblock: %s",
2512			 s_mount_opts);
2513		ret = 0;
2514		goto out_free;
2515	}
2516
2517	if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
2518		m_ctx->journal_devnum = s_ctx->journal_devnum;
2519	if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
2520		m_ctx->journal_ioprio = s_ctx->journal_ioprio;
2521
2522	ext4_apply_options(fc, sb);
2523	ret = 0;
2524
2525out_free:
2526	if (fc) {
2527		ext4_fc_free(fc);
2528		kfree(fc);
2529	}
2530	kfree(s_mount_opts);
2531	return ret;
2532}
2533
/*
 * Move the journaled quota settings from the context to the superblock.
 *
 * Nothing is done when the filesystem has the quota feature.  Otherwise,
 * for every quota type flagged in qname_spec, ownership of the name (which
 * may be NULL, meaning "cleared") is transferred from the context to
 * sbi->s_qf_names[] and the previous name, if any, is freed after an RCU
 * grace period.  The quota format is copied when it was specified.
 */
static void ext4_apply_quota_options(struct fs_context *fc,
				     struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	bool has_quota_feature = ext4_has_feature_quota(sb);
	struct ext4_fs_context *ctx = fc->fs_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *new_name, *old_name;
	int type;

	if (has_quota_feature)
		return;

	if (ctx->spec & EXT4_SPEC_JQUOTA) {
		for (type = 0; type < EXT4_MAXQUOTAS; type++) {
			if ((ctx->qname_spec & (1 << type)) == 0)
				continue;

			new_name = ctx->s_qf_names[type]; /* May be NULL */
			if (new_name != NULL)
				set_opt(sb, QUOTA);

			/* The superblock owns the string from here on. */
			ctx->s_qf_names[type] = NULL;
			old_name = rcu_replace_pointer(sbi->s_qf_names[type],
						new_name,
						lockdep_is_held(&sb->s_umount));
			if (old_name != NULL)
				kfree_rcu_mightsleep(old_name);
		}
	}

	if (ctx->spec & EXT4_SPEC_JQFMT)
		sbi->s_jquota_fmt = ctx->s_jquota_fmt;
#endif
}
2567
/*
 * Check quota settings consistency.
 *
 * Validates the quota options gathered in the context against each other
 * and against the current state of @sb:
 *  - project quota enforcement needs the project feature;
 *  - quota options and journaled quota names/format may not be changed
 *    while any quota is loaded;
 *  - a journaled quota file name may not differ from one already set;
 *  - journaled quota options are ignored (with a notice) when the quota
 *    feature is enabled;
 *  - old style (usrquota/grpquota) and journaled quota may not be mixed,
 *    and journaled quota needs a format.
 *
 * Returns 0 if the settings are acceptable, -EINVAL otherwise.  Always 0
 * without CONFIG_QUOTA.
 */
static int ext4_check_quota_consistency(struct fs_context *fc,
					struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	struct ext4_fs_context *ctx = fc->fs_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	bool quota_feature = ext4_has_feature_quota(sb);
	bool quota_loaded = sb_any_quota_loaded(sb);
	bool usr_named, grp_named, usr_plain, grp_plain;
	int quota_flags, type;

	/*
	 * We do the test below only for project quotas. 'usrquota' and
	 * 'grpquota' mount options are allowed even without quota feature
	 * to support legacy quotas in quota files.
	 */
	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) &&
	    !ext4_has_feature_project(sb)) {
		ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. "
			 "Cannot enable project quota enforcement.");
		return -EINVAL;
	}

	quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
		      EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA;
	/* Quota options were touched and all of them end up cleared. */
	if (quota_loaded &&
	    (ctx->mask_s_mount_opt & quota_flags) &&
	    !ctx_test_mount_opt(ctx, quota_flags)) {
		ext4_msg(NULL, KERN_ERR,
			 "Cannot change quota options when quota turned on");
		return -EINVAL;
	}

	if (ctx->spec & EXT4_SPEC_JQUOTA) {
		for (type = 0; type < EXT4_MAXQUOTAS; type++) {
			if ((ctx->qname_spec & (1 << type)) == 0)
				continue;

			/* Adding or dropping a name while quota is loaded */
			if (quota_loaded &&
			    !!sbi->s_qf_names[type] != !!ctx->s_qf_names[type])
				goto err_jquota_change;

			if (sbi->s_qf_names[type] && ctx->s_qf_names[type] &&
			    strcmp(get_qf_name(sb, sbi, type),
				   ctx->s_qf_names[type]) != 0) {
				ext4_msg(NULL, KERN_ERR,
					 "%s quota file already specified",
					 QTYPE2NAME(type));
				return -EINVAL;
			}
		}

		if (quota_feature) {
			ext4_msg(NULL, KERN_INFO,
				 "Journaled quota options ignored when "
				 "QUOTA feature is enabled");
			return 0;
		}
	}

	if (ctx->spec & EXT4_SPEC_JQFMT) {
		if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded)
			goto err_jquota_change;
		if (quota_feature) {
			ext4_msg(NULL, KERN_INFO, "Quota format mount options "
				 "ignored when QUOTA feature is enabled");
			return 0;
		}
	}

	/* Make sure we don't mix old and new quota format */
	usr_named = (get_qf_name(sb, sbi, USRQUOTA) ||
		     ctx->s_qf_names[USRQUOTA]);
	grp_named = (get_qf_name(sb, sbi, GRPQUOTA) ||
		     ctx->s_qf_names[GRPQUOTA]);

	usr_plain = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
		     test_opt(sb, USRQUOTA));

	grp_plain = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) ||
		     test_opt(sb, GRPQUOTA));

	/* A journaled quota file name overrides the plain option. */
	if (usr_named) {
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
		usr_plain = false;
	}
	if (grp_named) {
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
		grp_plain = false;
	}

	if (!usr_named && !grp_named)
		return 0;

	if (usr_plain || grp_plain) {
		ext4_msg(NULL, KERN_ERR, "old and new quota "
			 "format mixing");
		return -EINVAL;
	}

	if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) {
		ext4_msg(NULL, KERN_ERR, "journaled quota format "
			 "not specified");
		return -EINVAL;
	}

	return 0;

err_jquota_change:
	ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota "
		 "options when quota turned on");
	return -EINVAL;
#else
	return 0;
#endif
}
2688
2689static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
2690					    struct super_block *sb)
2691{
2692	const struct ext4_fs_context *ctx = fc->fs_private;
2693	const struct ext4_sb_info *sbi = EXT4_SB(sb);
2694
2695	if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
2696		return 0;
2697
2698	if (!ext4_has_feature_encrypt(sb)) {
2699		ext4_msg(NULL, KERN_WARNING,
2700			 "test_dummy_encryption requires encrypt feature");
2701		return -EINVAL;
2702	}
2703	/*
2704	 * This mount option is just for testing, and it's not worthwhile to
2705	 * implement the extra complexity (e.g. RCU protection) that would be
2706	 * needed to allow it to be set or changed during remount.  We do allow
2707	 * it to be specified during remount, but only if there is no change.
2708	 */
2709	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
2710		if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
2711						 &ctx->dummy_enc_policy))
2712			return 0;
2713		ext4_msg(NULL, KERN_WARNING,
2714			 "Can't set or change test_dummy_encryption on remount");
2715		return -EINVAL;
2716	}
2717	/* Also make sure s_mount_opts didn't contain a conflicting value. */
2718	if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) {
2719		if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
2720						 &ctx->dummy_enc_policy))
2721			return 0;
2722		ext4_msg(NULL, KERN_WARNING,
2723			 "Conflicting test_dummy_encryption options");
2724		return -EINVAL;
2725	}
2726	return 0;
2727}
2728
2729static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
2730					     struct super_block *sb)
2731{
2732	if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
2733	    /* if already set, it was already verified to be the same */
2734	    fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
2735		return;
2736	EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
2737	memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
2738	ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
2739}
2740
2741static int ext4_check_opt_consistency(struct fs_context *fc,
2742				      struct super_block *sb)
2743{
2744	struct ext4_fs_context *ctx = fc->fs_private;
2745	struct ext4_sb_info *sbi = fc->s_fs_info;
2746	int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
2747	int err;
2748
2749	if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
2750		ext4_msg(NULL, KERN_ERR,
2751			 "Mount option(s) incompatible with ext2");
2752		return -EINVAL;
2753	}
2754	if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
2755		ext4_msg(NULL, KERN_ERR,
2756			 "Mount option(s) incompatible with ext3");
2757		return -EINVAL;
2758	}
2759
2760	if (ctx->s_want_extra_isize >
2761	    (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
2762		ext4_msg(NULL, KERN_ERR,
2763			 "Invalid want_extra_isize %d",
2764			 ctx->s_want_extra_isize);
2765		return -EINVAL;
2766	}
2767
2768	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) {
2769		int blocksize =
2770			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
2771		if (blocksize < PAGE_SIZE)
2772			ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an "
2773				 "experimental mount option 'dioread_nolock' "
2774				 "for blocksize < PAGE_SIZE");
2775	}
2776
2777	err = ext4_check_test_dummy_encryption(fc, sb);
2778	if (err)
2779		return err;
2780
2781	if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
2782		if (!sbi->s_journal) {
2783			ext4_msg(NULL, KERN_WARNING,
2784				 "Remounting file system with no journal "
2785				 "so ignoring journalled data option");
2786			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
2787		} else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
2788			   test_opt(sb, DATA_FLAGS)) {
2789			ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
2790				 "on remount");
2791			return -EINVAL;
2792		}
2793	}
2794
2795	if (is_remount) {
2796		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
2797		    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2798			ext4_msg(NULL, KERN_ERR, "can't mount with "
2799				 "both data=journal and dax");
2800			return -EINVAL;
2801		}
2802
2803		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
2804		    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
2805		     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
2806fail_dax_change_remount:
2807			ext4_msg(NULL, KERN_ERR, "can't change "
2808				 "dax mount option while remounting");
2809			return -EINVAL;
2810		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
2811			 (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
2812			  (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
2813			goto fail_dax_change_remount;
2814		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
2815			   ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
2816			    (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
2817			    !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
2818			goto fail_dax_change_remount;
2819		}
2820	}
2821
2822	return ext4_check_quota_consistency(fc, sb);
2823}
2824
/*
 * Copy the options collected in the context to the superblock.
 *
 * For each option word, the bits recorded in the context's mask are first
 * cleared and then the context's values are ORed in, so only bits that
 * were touched while parsing are changed.  Scalar settings are copied only
 * when their EXT4_SPEC_* bit says they were specified.  Quota options and
 * the dummy encryption policy are applied by their own helpers.
 */
static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
{
	struct ext4_fs_context *ctx = fc->fs_private;
	struct ext4_sb_info *sbi = fc->s_fs_info;

	sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
	sbi->s_mount_opt |= ctx->vals_s_mount_opt;
	sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
	sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
	sbi->s_mount_flags &= ~ctx->mask_s_mount_flags;
	sbi->s_mount_flags |= ctx->vals_s_mount_flags;
	sb->s_flags &= ~ctx->mask_s_flags;
	sb->s_flags |= ctx->vals_s_flags;

/* Copy member X to sbi if EXT4_SPEC_X is set; relies on matching names. */
#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
	APPLY(s_commit_interval);
	APPLY(s_stripe);
	APPLY(s_max_batch_time);
	APPLY(s_min_batch_time);
	APPLY(s_want_extra_isize);
	APPLY(s_inode_readahead_blks);
	APPLY(s_max_dir_size_kb);
	APPLY(s_li_wait_mult);
	APPLY(s_resgid);
	APPLY(s_resuid);

#ifdef CONFIG_EXT4_DEBUG
	APPLY(s_fc_debug_max_replay);
#endif

	ext4_apply_quota_options(fc, sb);
	ext4_apply_test_dummy_encryption(ctx, sb);
}
2858
2859
/*
 * Sanity check the quota options gathered in the context, independent of
 * any superblock.
 *
 * When a journaled quota file name was given for a quota type, a plain
 * usrquota/grpquota request for that same type is dropped.  If a plain
 * request remains for the other type while any journaled name is present,
 * old and new quota formats are being mixed and -EINVAL is returned.
 *
 * Returns 1 when the options are acceptable (callers test for < 0).
 */
static int ext4_validate_options(struct fs_context *fc)
{
#ifdef CONFIG_QUOTA
	struct ext4_fs_context *ctx = fc->fs_private;
	char *usr_name = ctx->s_qf_names[USRQUOTA];
	char *grp_name = ctx->s_qf_names[GRPQUOTA];

	if (!usr_name && !grp_name)
		return 1;

	if (usr_name && ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA))
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);

	if (grp_name && ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA))
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);

	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
	    ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) {
		ext4_msg(NULL, KERN_ERR, "old and new quota "
			 "format mixing");
		return -EINVAL;
	}
#endif
	return 1;
}
2886
/*
 * Emit the journaled quota options of @sb to @seq: ",jqfmt=<name>" when
 * a quota format is set, followed by usrjquota / grpjquota for each quota
 * file name that is set.  The names are read under rcu_read_lock().
 * Emits nothing without CONFIG_QUOTA.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
					   struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *usr_name, *grp_name;

	if (sbi->s_jquota_fmt) {
		char *fmtname;

		switch (sbi->s_jquota_fmt) {
		case QFMT_VFS_OLD:
			fmtname = "vfsold";
			break;
		case QFMT_VFS_V0:
			fmtname = "vfsv0";
			break;
		case QFMT_VFS_V1:
			fmtname = "vfsv1";
			break;
		default:
			/* Unknown format: print an empty name. */
			fmtname = "";
			break;
		}
		seq_printf(seq, ",jqfmt=%s", fmtname);
	}

	rcu_read_lock();
	usr_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
	grp_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
	if (usr_name != NULL)
		seq_show_option(seq, "usrjquota", usr_name);
	if (grp_name != NULL)
		seq_show_option(seq, "grpjquota", grp_name);
	rcu_read_unlock();
#endif
}
2921
2922static const char *token2str(int token)
2923{
2924	const struct fs_parameter_spec *spec;
2925
2926	for (spec = ext4_param_specs; spec->name != NULL; spec++)
2927		if (spec->opt == token && !spec->type)
2928			break;
2929	return spec->name;
2930}
2931
2932/*
2933 * Show an option if
2934 *  - it's set to a non-default value OR
2935 *  - if the per-sb default is different from the global default
2936 */
2937static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
2938			      int nodefs)
2939{
2940	struct ext4_sb_info *sbi = EXT4_SB(sb);
2941	struct ext4_super_block *es = sbi->s_es;
2942	int def_errors;
2943	const struct mount_opts *m;
2944	char sep = nodefs ? '\n' : ',';
2945
2946#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
2947#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
2948
2949	if (sbi->s_sb_block != 1)
2950		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
2951
2952	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
2953		int want_set = m->flags & MOPT_SET;
2954		int opt_2 = m->flags & MOPT_2;
2955		unsigned int mount_opt, def_mount_opt;
2956
2957		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
2958		    m->flags & MOPT_SKIP)
2959			continue;
2960
2961		if (opt_2) {
2962			mount_opt = sbi->s_mount_opt2;
2963			def_mount_opt = sbi->s_def_mount_opt2;
2964		} else {
2965			mount_opt = sbi->s_mount_opt;
2966			def_mount_opt = sbi->s_def_mount_opt;
2967		}
2968		/* skip if same as the default */
2969		if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt)))
2970			continue;
2971		/* select Opt_noFoo vs Opt_Foo */
2972		if ((want_set &&
2973		     (mount_opt & m->mount_opt) != m->mount_opt) ||
2974		    (!want_set && (mount_opt & m->mount_opt)))
2975			continue;
2976		SEQ_OPTS_PRINT("%s", token2str(m->token));
2977	}
2978
2979	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
2980	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
2981		SEQ_OPTS_PRINT("resuid=%u",
2982				from_kuid_munged(&init_user_ns, sbi->s_resuid));
2983	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
2984	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
2985		SEQ_OPTS_PRINT("resgid=%u",
2986				from_kgid_munged(&init_user_ns, sbi->s_resgid));
2987	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
2988	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
2989		SEQ_OPTS_PUTS("errors=remount-ro");
2990	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
2991		SEQ_OPTS_PUTS("errors=continue");
2992	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
2993		SEQ_OPTS_PUTS("errors=panic");
2994	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
2995		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
2996	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
2997		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
2998	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
2999		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
3000	if (nodefs || sbi->s_stripe)
3001		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
3002	if (nodefs || EXT4_MOUNT_DATA_FLAGS &
3003			(sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
3004		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
3005			SEQ_OPTS_PUTS("data=journal");
3006		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
3007			SEQ_OPTS_PUTS("data=ordered");
3008		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
3009			SEQ_OPTS_PUTS("data=writeback");
3010	}
3011	if (nodefs ||
3012	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
3013		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
3014			       sbi->s_inode_readahead_blks);
3015
3016	if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
3017		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
3018		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
3019	if (nodefs || sbi->s_max_dir_size_kb)
3020		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
3021	if (test_opt(sb, DATA_ERR_ABORT))
3022		SEQ_OPTS_PUTS("data_err=abort");
3023
3024	fscrypt_show_test_dummy_encryption(seq, sep, sb);
3025
3026	if (sb->s_flags & SB_INLINECRYPT)
3027		SEQ_OPTS_PUTS("inlinecrypt");
3028
3029	if (test_opt(sb, DAX_ALWAYS)) {
3030		if (IS_EXT2_SB(sb))
3031			SEQ_OPTS_PUTS("dax");
3032		else
3033			SEQ_OPTS_PUTS("dax=always");
3034	} else if (test_opt2(sb, DAX_NEVER)) {
3035		SEQ_OPTS_PUTS("dax=never");
3036	} else if (test_opt2(sb, DAX_INODE)) {
3037		SEQ_OPTS_PUTS("dax=inode");
3038	}
3039
3040	if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
3041			!test_opt2(sb, MB_OPTIMIZE_SCAN)) {
3042		SEQ_OPTS_PUTS("mb_optimize_scan=0");
3043	} else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
3044			test_opt2(sb, MB_OPTIMIZE_SCAN)) {
3045		SEQ_OPTS_PUTS("mb_optimize_scan=1");
3046	}
3047
3048	ext4_show_quota_options(seq, sb);
3049	return 0;
3050}
3051
3052static int ext4_show_options(struct seq_file *seq, struct dentry *root)
3053{
3054	return _ext4_show_options(seq, root->d_sb, 0);
3055}
3056
3057int ext4_seq_options_show(struct seq_file *seq, void *offset)
3058{
3059	struct super_block *sb = seq->private;
3060	int rc;
3061
3062	seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
3063	rc = _ext4_show_options(seq, sb, 1);
3064	seq_puts(seq, "\n");
3065	return rc;
3066}
3067
3068static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
3069			    int read_only)
3070{
3071	struct ext4_sb_info *sbi = EXT4_SB(sb);
3072	int err = 0;
3073
3074	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
3075		ext4_msg(sb, KERN_ERR, "revision level too high, "
3076			 "forcing read-only mode");
3077		err = -EROFS;
3078		goto done;
3079	}
3080	if (read_only)
3081		goto done;
3082	if (!(sbi->s_mount_state & EXT4_VALID_FS))
3083		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
3084			 "running e2fsck is recommended");
3085	else if (sbi->s_mount_state & EXT4_ERROR_FS)
3086		ext4_msg(sb, KERN_WARNING,
3087			 "warning: mounting fs with errors, "
3088			 "running e2fsck is recommended");
3089	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
3090		 le16_to_cpu(es->s_mnt_count) >=
3091		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
3092		ext4_msg(sb, KERN_WARNING,
3093			 "warning: maximal mount count reached, "
3094			 "running e2fsck is recommended");
3095	else if (le32_to_cpu(es->s_checkinterval) &&
3096		 (ext4_get_tstamp(es, s_lastcheck) +
3097		  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
3098		ext4_msg(sb, KERN_WARNING,
3099			 "warning: checktime reached, "
3100			 "running e2fsck is recommended");
3101	if (!sbi->s_journal)
3102		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
3103	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
3104		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
3105	le16_add_cpu(&es->s_mnt_count, 1);
3106	ext4_update_tstamp(es, s_mtime);
3107	if (sbi->s_journal) {
3108		ext4_set_feature_journal_needs_recovery(sb);
3109		if (ext4_has_feature_orphan_file(sb))
3110			ext4_set_feature_orphan_present(sb);
3111	}
3112
3113	err = ext4_commit_super(sb);
3114done:
3115	if (test_opt(sb, DEBUG))
3116		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
3117				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
3118			sb->s_blocksize,
3119			sbi->s_groups_count,
3120			EXT4_BLOCKS_PER_GROUP(sb),
3121			EXT4_INODES_PER_GROUP(sb),
3122			sbi->s_mount_opt, sbi->s_mount_opt2);
3123	return err;
3124}
3125
3126int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
3127{
3128	struct ext4_sb_info *sbi = EXT4_SB(sb);
3129	struct flex_groups **old_groups, **new_groups;
3130	int size, i, j;
3131
3132	if (!sbi->s_log_groups_per_flex)
3133		return 0;
3134
3135	size = ext4_flex_group(sbi, ngroup - 1) + 1;
3136	if (size <= sbi->s_flex_groups_allocated)
3137		return 0;
3138
3139	new_groups = kvzalloc(roundup_pow_of_two(size *
3140			      sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
3141	if (!new_groups) {
3142		ext4_msg(sb, KERN_ERR,
3143			 "not enough memory for %d flex group pointers", size);
3144		return -ENOMEM;
3145	}
3146	for (i = sbi->s_flex_groups_allocated; i < size; i++) {
3147		new_groups[i] = kvzalloc(roundup_pow_of_two(
3148					 sizeof(struct flex_groups)),
3149					 GFP_KERNEL);
3150		if (!new_groups[i]) {
3151			for (j = sbi->s_flex_groups_allocated; j < i; j++)
3152				kvfree(new_groups[j]);
3153			kvfree(new_groups);
3154			ext4_msg(sb, KERN_ERR,
3155				 "not enough memory for %d flex groups", size);
3156			return -ENOMEM;
3157		}
3158	}
3159	rcu_read_lock();
3160	old_groups = rcu_dereference(sbi->s_flex_groups);
3161	if (old_groups)
3162		memcpy(new_groups, old_groups,
3163		       (sbi->s_flex_groups_allocated *
3164			sizeof(struct flex_groups *)));
3165	rcu_read_unlock();
3166	rcu_assign_pointer(sbi->s_flex_groups, new_groups);
3167	sbi->s_flex_groups_allocated = size;
3168	if (old_groups)
3169		ext4_kvfree_array_rcu(old_groups);
3170	return 0;
3171}
3172
3173static int ext4_fill_flex_info(struct super_block *sb)
3174{
3175	struct ext4_sb_info *sbi = EXT4_SB(sb);
3176	struct ext4_group_desc *gdp = NULL;
3177	struct flex_groups *fg;
3178	ext4_group_t flex_group;
3179	int i, err;
3180
3181	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
3182	if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
3183		sbi->s_log_groups_per_flex = 0;
3184		return 1;
3185	}
3186
3187	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
3188	if (err)
3189		goto failed;
3190
3191	for (i = 0; i < sbi->s_groups_count; i++) {
3192		gdp = ext4_get_group_desc(sb, i, NULL);
3193
3194		flex_group = ext4_flex_group(sbi, i);
3195		fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
3196		atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
3197		atomic64_add(ext4_free_group_clusters(sb, gdp),
3198			     &fg->free_clusters);
3199		atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
3200	}
3201
3202	return 1;
3203failed:
3204	return 0;
3205}
3206
3207static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
3208				   struct ext4_group_desc *gdp)
3209{
3210	int offset = offsetof(struct ext4_group_desc, bg_checksum);
3211	__u16 crc = 0;
3212	__le32 le_group = cpu_to_le32(block_group);
3213	struct ext4_sb_info *sbi = EXT4_SB(sb);
3214
3215	if (ext4_has_metadata_csum(sbi->s_sb)) {
3216		/* Use new metadata_csum algorithm */
3217		__u32 csum32;
3218		__u16 dummy_csum = 0;
3219
3220		csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
3221				     sizeof(le_group));
3222		csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
3223		csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
3224				     sizeof(dummy_csum));
3225		offset += sizeof(dummy_csum);
3226		if (offset < sbi->s_desc_size)
3227			csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
3228					     sbi->s_desc_size - offset);
3229
3230		crc = csum32 & 0xFFFF;
3231		goto out;
3232	}
3233
3234	/* old crc16 code */
3235	if (!ext4_has_feature_gdt_csum(sb))
3236		return 0;
3237
3238	crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
3239	crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
3240	crc = crc16(crc, (__u8 *)gdp, offset);
3241	offset += sizeof(gdp->bg_checksum); /* skip checksum */
3242	/* for checksum of struct ext4_group_desc do the rest...*/
3243	if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size)
3244		crc = crc16(crc, (__u8 *)gdp + offset,
3245			    sbi->s_desc_size - offset);
3246
3247out:
3248	return cpu_to_le16(crc);
3249}
3250
3251int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
3252				struct ext4_group_desc *gdp)
3253{
3254	if (ext4_has_group_desc_csum(sb) &&
3255	    (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
3256		return 0;
3257
3258	return 1;
3259}
3260
3261void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
3262			      struct ext4_group_desc *gdp)
3263{
3264	if (!ext4_has_group_desc_csum(sb))
3265		return;
3266	gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
3267}
3268
3269/* Called at mount-time, super-block is locked */
3270static int ext4_check_descriptors(struct super_block *sb,
3271				  ext4_fsblk_t sb_block,
3272				  ext4_group_t *first_not_zeroed)
3273{
3274	struct ext4_sb_info *sbi = EXT4_SB(sb);
3275	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
3276	ext4_fsblk_t last_block;
3277	ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
3278	ext4_fsblk_t block_bitmap;
3279	ext4_fsblk_t inode_bitmap;
3280	ext4_fsblk_t inode_table;
3281	int flexbg_flag = 0;
3282	ext4_group_t i, grp = sbi->s_groups_count;
3283
3284	if (ext4_has_feature_flex_bg(sb))
3285		flexbg_flag = 1;
3286
3287	ext4_debug("Checking group descriptors");
3288
3289	for (i = 0; i < sbi->s_groups_count; i++) {
3290		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
3291
3292		if (i == sbi->s_groups_count - 1 || flexbg_flag)
3293			last_block = ext4_blocks_count(sbi->s_es) - 1;
3294		else
3295			last_block = first_block +
3296				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
3297
3298		if ((grp == sbi->s_groups_count) &&
3299		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3300			grp = i;
3301
3302		block_bitmap = ext4_block_bitmap(sb, gdp);
3303		if (block_bitmap == sb_block) {
3304			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3305				 "Block bitmap for group %u overlaps "
3306				 "superblock", i);
3307			if (!sb_rdonly(sb))
3308				return 0;
3309		}
3310		if (block_bitmap >= sb_block + 1 &&
3311		    block_bitmap <= last_bg_block) {
3312			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3313				 "Block bitmap for group %u overlaps "
3314				 "block group descriptors", i);
3315			if (!sb_rdonly(sb))
3316				return 0;
3317		}
3318		if (block_bitmap < first_block || block_bitmap > last_block) {
3319			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3320			       "Block bitmap for group %u not in group "
3321			       "(block %llu)!", i, block_bitmap);
3322			return 0;
3323		}
3324		inode_bitmap = ext4_inode_bitmap(sb, gdp);
3325		if (inode_bitmap == sb_block) {
3326			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3327				 "Inode bitmap for group %u overlaps "
3328				 "superblock", i);
3329			if (!sb_rdonly(sb))
3330				return 0;
3331		}
3332		if (inode_bitmap >= sb_block + 1 &&
3333		    inode_bitmap <= last_bg_block) {
3334			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3335				 "Inode bitmap for group %u overlaps "
3336				 "block group descriptors", i);
3337			if (!sb_rdonly(sb))
3338				return 0;
3339		}
3340		if (inode_bitmap < first_block || inode_bitmap > last_block) {
3341			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3342			       "Inode bitmap for group %u not in group "
3343			       "(block %llu)!", i, inode_bitmap);
3344			return 0;
3345		}
3346		inode_table = ext4_inode_table(sb, gdp);
3347		if (inode_table == sb_block) {
3348			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3349				 "Inode table for group %u overlaps "
3350				 "superblock", i);
3351			if (!sb_rdonly(sb))
3352				return 0;
3353		}
3354		if (inode_table >= sb_block + 1 &&
3355		    inode_table <= last_bg_block) {
3356			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3357				 "Inode table for group %u overlaps "
3358				 "block group descriptors", i);
3359			if (!sb_rdonly(sb))
3360				return 0;
3361		}
3362		if (inode_table < first_block ||
3363		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
3364			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3365			       "Inode table for group %u not in group "
3366			       "(block %llu)!", i, inode_table);
3367			return 0;
3368		}
3369		ext4_lock_group(sb, i);
3370		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
3371			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
3372				 "Checksum for group %u failed (%u!=%u)",
3373				 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
3374				     gdp)), le16_to_cpu(gdp->bg_checksum));
3375			if (!sb_rdonly(sb)) {
3376				ext4_unlock_group(sb, i);
3377				return 0;
3378			}
3379		}
3380		ext4_unlock_group(sb, i);
3381		if (!flexbg_flag)
3382			first_block += EXT4_BLOCKS_PER_GROUP(sb);
3383	}
3384	if (NULL != first_not_zeroed)
3385		*first_not_zeroed = grp;
3386	return 1;
3387}
3388
3389/*
3390 * Maximal extent format file size.
3391 * Resulting logical blkno at s_maxbytes must fit in our on-disk
3392 * extent format containers, within a sector_t, and within i_blocks
3393 * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
3394 * so that won't be a limiting factor.
3395 *
3396 * However there is other limiting factor. We do store extents in the form
3397 * of starting block and length, hence the resulting length of the extent
3398 * covering maximum file size must fit into on-disk format containers as
3399 * well. Given that length is always by 1 unit bigger than max unit (because
3400 * we count 0 as well) we have to lower the s_maxbytes by one fs block.
3401 *
3402 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
3403 */
3404static loff_t ext4_max_size(int blkbits, int has_huge_files)
3405{
3406	loff_t res;
3407	loff_t upper_limit = MAX_LFS_FILESIZE;
3408
3409	BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
3410
3411	if (!has_huge_files) {
3412		upper_limit = (1LL << 32) - 1;
3413
3414		/* total blocks in file system block size */
3415		upper_limit >>= (blkbits - 9);
3416		upper_limit <<= blkbits;
3417	}
3418
3419	/*
3420	 * 32-bit extent-start container, ee_block. We lower the maxbytes
3421	 * by one fs block, so ee_len can cover the extent of maximum file
3422	 * size
3423	 */
3424	res = (1LL << 32) - 1;
3425	res <<= blkbits;
3426
3427	/* Sanity check against vm- & vfs- imposed limits */
3428	if (res > upper_limit)
3429		res = upper_limit;
3430
3431	return res;
3432}
3433
3434/*
3435 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
3436 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
3437 * We need to be 1 filesystem block less than the 2^48 sector limit.
3438 */
3439static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
3440{
3441	loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
3442	int meta_blocks;
3443	unsigned int ppb = 1 << (bits - 2);
3444
3445	/*
3446	 * This is calculated to be the largest file size for a dense, block
3447	 * mapped file such that the file's total number of 512-byte sectors,
3448	 * including data and all indirect blocks, does not exceed (2^48 - 1).
3449	 *
3450	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
3451	 * number of 512-byte sectors of the file.
3452	 */
3453	if (!has_huge_files) {
3454		/*
3455		 * !has_huge_files or implies that the inode i_block field
3456		 * represents total file blocks in 2^32 512-byte sectors ==
3457		 * size of vfs inode i_blocks * 8
3458		 */
3459		upper_limit = (1LL << 32) - 1;
3460
3461		/* total blocks in file system block size */
3462		upper_limit >>= (bits - 9);
3463
3464	} else {
3465		/*
3466		 * We use 48 bit ext4_inode i_blocks
3467		 * With EXT4_HUGE_FILE_FL set the i_blocks
3468		 * represent total number of blocks in
3469		 * file system block size
3470		 */
3471		upper_limit = (1LL << 48) - 1;
3472
3473	}
3474
3475	/* Compute how many blocks we can address by block tree */
3476	res += ppb;
3477	res += ppb * ppb;
3478	res += ((loff_t)ppb) * ppb * ppb;
3479	/* Compute how many metadata blocks are needed */
3480	meta_blocks = 1;
3481	meta_blocks += 1 + ppb;
3482	meta_blocks += 1 + ppb + ppb * ppb;
3483	/* Does block tree limit file size? */
3484	if (res + meta_blocks <= upper_limit)
3485		goto check_lfs;
3486
3487	res = upper_limit;
3488	/* How many metadata blocks are needed for addressing upper_limit? */
3489	upper_limit -= EXT4_NDIR_BLOCKS;
3490	/* indirect blocks */
3491	meta_blocks = 1;
3492	upper_limit -= ppb;
3493	/* double indirect blocks */
3494	if (upper_limit < ppb * ppb) {
3495		meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
3496		res -= meta_blocks;
3497		goto check_lfs;
3498	}
3499	meta_blocks += 1 + ppb;
3500	upper_limit -= ppb * ppb;
3501	/* tripple indirect blocks for the rest */
3502	meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
3503		DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
3504	res -= meta_blocks;
3505check_lfs:
3506	res <<= bits;
3507	if (res > MAX_LFS_FILESIZE)
3508		res = MAX_LFS_FILESIZE;
3509
3510	return res;
3511}
3512
3513static ext4_fsblk_t descriptor_loc(struct super_block *sb,
3514				   ext4_fsblk_t logical_sb_block, int nr)
3515{
3516	struct ext4_sb_info *sbi = EXT4_SB(sb);
3517	ext4_group_t bg, first_meta_bg;
3518	int has_super = 0;
3519
3520	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
3521
3522	if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
3523		return logical_sb_block + nr + 1;
3524	bg = sbi->s_desc_per_block * nr;
3525	if (ext4_bg_has_super(sb, bg))
3526		has_super = 1;
3527
3528	/*
3529	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
3530	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
3531	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
3532	 * compensate.
3533	 */
3534	if (sb->s_blocksize == 1024 && nr == 0 &&
3535	    le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
3536		has_super++;
3537
3538	return (has_super + ext4_group_first_block_no(sb, bg));
3539}
3540
3541/**
3542 * ext4_get_stripe_size: Get the stripe size.
3543 * @sbi: In memory super block info
3544 *
3545 * If we have specified it via mount option, then
3546 * use the mount option value. If the value specified at mount time is
3547 * greater than the blocks per group use the super block value.
3548 * If the super block value is greater than blocks per group return 0.
3549 * Allocator needs it be less than blocks per group.
3550 *
3551 */
3552static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
3553{
3554	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
3555	unsigned long stripe_width =
3556			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
3557	int ret;
3558
3559	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
3560		ret = sbi->s_stripe;
3561	else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
3562		ret = stripe_width;
3563	else if (stride && stride <= sbi->s_blocks_per_group)
3564		ret = stride;
3565	else
3566		ret = 0;
3567
3568	/*
3569	 * If the stripe width is 1, this makes no sense and
3570	 * we set it to 0 to turn off stripe handling code.
3571	 */
3572	if (ret <= 1)
3573		ret = 0;
3574
3575	return ret;
3576}
3577
3578/*
3579 * Check whether this filesystem can be mounted based on
3580 * the features present and the RDONLY/RDWR mount requested.
3581 * Returns 1 if this filesystem can be mounted as requested,
3582 * 0 if it cannot be.
3583 */
3584int ext4_feature_set_ok(struct super_block *sb, int readonly)
3585{
3586	if (ext4_has_unknown_ext4_incompat_features(sb)) {
3587		ext4_msg(sb, KERN_ERR,
3588			"Couldn't mount because of "
3589			"unsupported optional features (%x)",
3590			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
3591			~EXT4_FEATURE_INCOMPAT_SUPP));
3592		return 0;
3593	}
3594
3595#if !IS_ENABLED(CONFIG_UNICODE)
3596	if (ext4_has_feature_casefold(sb)) {
3597		ext4_msg(sb, KERN_ERR,
3598			 "Filesystem with casefold feature cannot be "
3599			 "mounted without CONFIG_UNICODE");
3600		return 0;
3601	}
3602#endif
3603
3604	if (readonly)
3605		return 1;
3606
3607	if (ext4_has_feature_readonly(sb)) {
3608		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
3609		sb->s_flags |= SB_RDONLY;
3610		return 1;
3611	}
3612
3613	/* Check that feature set is OK for a read-write mount */
3614	if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
3615		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
3616			 "unsupported optional features (%x)",
3617			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
3618				~EXT4_FEATURE_RO_COMPAT_SUPP));
3619		return 0;
3620	}
3621	if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
3622		ext4_msg(sb, KERN_ERR,
3623			 "Can't support bigalloc feature without "
3624			 "extents feature\n");
3625		return 0;
3626	}
3627
3628#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
3629	if (!readonly && (ext4_has_feature_quota(sb) ||
3630			  ext4_has_feature_project(sb))) {
3631		ext4_msg(sb, KERN_ERR,
3632			 "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
3633		return 0;
3634	}
3635#endif  /* CONFIG_QUOTA */
3636	return 1;
3637}
3638
3639/*
3640 * This function is called once a day if we have errors logged
3641 * on the file system
3642 */
3643static void print_daily_error_info(struct timer_list *t)
3644{
3645	struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
3646	struct super_block *sb = sbi->s_sb;
3647	struct ext4_super_block *es = sbi->s_es;
3648
3649	if (es->s_error_count)
3650		/* fsck newer than v1.41.13 is needed to clean this condition. */
3651		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
3652			 le32_to_cpu(es->s_error_count));
3653	if (es->s_first_error_time) {
3654		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
3655		       sb->s_id,
3656		       ext4_get_tstamp(es, s_first_error_time),
3657		       (int) sizeof(es->s_first_error_func),
3658		       es->s_first_error_func,
3659		       le32_to_cpu(es->s_first_error_line));
3660		if (es->s_first_error_ino)
3661			printk(KERN_CONT ": inode %u",
3662			       le32_to_cpu(es->s_first_error_ino));
3663		if (es->s_first_error_block)
3664			printk(KERN_CONT ": block %llu", (unsigned long long)
3665			       le64_to_cpu(es->s_first_error_block));
3666		printk(KERN_CONT "\n");
3667	}
3668	if (es->s_last_error_time) {
3669		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
3670		       sb->s_id,
3671		       ext4_get_tstamp(es, s_last_error_time),
3672		       (int) sizeof(es->s_last_error_func),
3673		       es->s_last_error_func,
3674		       le32_to_cpu(es->s_last_error_line));
3675		if (es->s_last_error_ino)
3676			printk(KERN_CONT ": inode %u",
3677			       le32_to_cpu(es->s_last_error_ino));
3678		if (es->s_last_error_block)
3679			printk(KERN_CONT ": block %llu", (unsigned long long)
3680			       le64_to_cpu(es->s_last_error_block));
3681		printk(KERN_CONT "\n");
3682	}
3683	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
3684}
3685
3686/* Find next suitable group and run ext4_init_inode_table */
3687static int ext4_run_li_request(struct ext4_li_request *elr)
3688{
3689	struct ext4_group_desc *gdp = NULL;
3690	struct super_block *sb = elr->lr_super;
3691	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
3692	ext4_group_t group = elr->lr_next_group;
3693	unsigned int prefetch_ios = 0;
3694	int ret = 0;
3695	u64 start_time;
3696
3697	if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
3698		elr->lr_next_group = ext4_mb_prefetch(sb, group,
3699				EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
3700		if (prefetch_ios)
3701			ext4_mb_prefetch_fini(sb, elr->lr_next_group,
3702					      prefetch_ios);
3703		trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
3704					    prefetch_ios);
3705		if (group >= elr->lr_next_group) {
3706			ret = 1;
3707			if (elr->lr_first_not_zeroed != ngroups &&
3708			    !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
3709				elr->lr_next_group = elr->lr_first_not_zeroed;
3710				elr->lr_mode = EXT4_LI_MODE_ITABLE;
3711				ret = 0;
3712			}
3713		}
3714		return ret;
3715	}
3716
3717	for (; group < ngroups; group++) {
3718		gdp = ext4_get_group_desc(sb, group, NULL);
3719		if (!gdp) {
3720			ret = 1;
3721			break;
3722		}
3723
3724		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3725			break;
3726	}
3727
3728	if (group >= ngroups)
3729		ret = 1;
3730
3731	if (!ret) {
3732		start_time = ktime_get_real_ns();
3733		ret = ext4_init_inode_table(sb, group,
3734					    elr->lr_timeout ? 0 : 1);
3735		trace_ext4_lazy_itable_init(sb, group);
3736		if (elr->lr_timeout == 0) {
3737			elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
3738				EXT4_SB(elr->lr_super)->s_li_wait_mult);
3739		}
3740		elr->lr_next_sched = jiffies + elr->lr_timeout;
3741		elr->lr_next_group = group + 1;
3742	}
3743	return ret;
3744}
3745
3746/*
3747 * Remove lr_request from the list_request and free the
3748 * request structure. Should be called with li_list_mtx held
3749 */
3750static void ext4_remove_li_request(struct ext4_li_request *elr)
3751{
3752	if (!elr)
3753		return;
3754
3755	list_del(&elr->lr_request);
3756	EXT4_SB(elr->lr_super)->s_li_request = NULL;
3757	kfree(elr);
3758}
3759
3760static void ext4_unregister_li_request(struct super_block *sb)
3761{
3762	mutex_lock(&ext4_li_mtx);
3763	if (!ext4_li_info) {
3764		mutex_unlock(&ext4_li_mtx);
3765		return;
3766	}
3767
3768	mutex_lock(&ext4_li_info->li_list_mtx);
3769	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
3770	mutex_unlock(&ext4_li_info->li_list_mtx);
3771	mutex_unlock(&ext4_li_mtx);
3772}
3773
3774static struct task_struct *ext4_lazyinit_task;
3775
3776/*
3777 * This is the function where ext4lazyinit thread lives. It walks
3778 * through the request list searching for next scheduled filesystem.
3779 * When such a fs is found, run the lazy initialization request
3780 * (ext4_rn_li_request) and keep track of the time spend in this
3781 * function. Based on that time we compute next schedule time of
3782 * the request. When walking through the list is complete, compute
3783 * next waking time and put itself into sleep.
3784 */
3785static int ext4_lazyinit_thread(void *arg)
3786{
3787	struct ext4_lazy_init *eli = arg;
3788	struct list_head *pos, *n;
3789	struct ext4_li_request *elr;
3790	unsigned long next_wakeup, cur;
3791
3792	BUG_ON(NULL == eli);
3793	set_freezable();
3794
3795cont_thread:
3796	while (true) {
3797		next_wakeup = MAX_JIFFY_OFFSET;
3798
3799		mutex_lock(&eli->li_list_mtx);
3800		if (list_empty(&eli->li_request_list)) {
3801			mutex_unlock(&eli->li_list_mtx);
3802			goto exit_thread;
3803		}
3804		list_for_each_safe(pos, n, &eli->li_request_list) {
3805			int err = 0;
3806			int progress = 0;
3807			elr = list_entry(pos, struct ext4_li_request,
3808					 lr_request);
3809
3810			if (time_before(jiffies, elr->lr_next_sched)) {
3811				if (time_before(elr->lr_next_sched, next_wakeup))
3812					next_wakeup = elr->lr_next_sched;
3813				continue;
3814			}
3815			if (down_read_trylock(&elr->lr_super->s_umount)) {
3816				if (sb_start_write_trylock(elr->lr_super)) {
3817					progress = 1;
3818					/*
3819					 * We hold sb->s_umount, sb can not
3820					 * be removed from the list, it is
3821					 * now safe to drop li_list_mtx
3822					 */
3823					mutex_unlock(&eli->li_list_mtx);
3824					err = ext4_run_li_request(elr);
3825					sb_end_write(elr->lr_super);
3826					mutex_lock(&eli->li_list_mtx);
3827					n = pos->next;
3828				}
3829				up_read((&elr->lr_super->s_umount));
3830			}
3831			/* error, remove the lazy_init job */
3832			if (err) {
3833				ext4_remove_li_request(elr);
3834				continue;
3835			}
3836			if (!progress) {
3837				elr->lr_next_sched = jiffies +
3838					get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
3839			}
3840			if (time_before(elr->lr_next_sched, next_wakeup))
3841				next_wakeup = elr->lr_next_sched;
3842		}
3843		mutex_unlock(&eli->li_list_mtx);
3844
3845		try_to_freeze();
3846
3847		cur = jiffies;
3848		if ((time_after_eq(cur, next_wakeup)) ||
3849		    (MAX_JIFFY_OFFSET == next_wakeup)) {
3850			cond_resched();
3851			continue;
3852		}
3853
3854		schedule_timeout_interruptible(next_wakeup - cur);
3855
3856		if (kthread_should_stop()) {
3857			ext4_clear_request_list();
3858			goto exit_thread;
3859		}
3860	}
3861
3862exit_thread:
3863	/*
3864	 * It looks like the request list is empty, but we need
3865	 * to check it under the li_list_mtx lock, to prevent any
3866	 * additions into it, and of course we should lock ext4_li_mtx
3867	 * to atomically free the list and ext4_li_info, because at
3868	 * this point another ext4 filesystem could be registering
3869	 * new one.
3870	 */
3871	mutex_lock(&ext4_li_mtx);
3872	mutex_lock(&eli->li_list_mtx);
3873	if (!list_empty(&eli->li_request_list)) {
3874		mutex_unlock(&eli->li_list_mtx);
3875		mutex_unlock(&ext4_li_mtx);
3876		goto cont_thread;
3877	}
3878	mutex_unlock(&eli->li_list_mtx);
3879	kfree(ext4_li_info);
3880	ext4_li_info = NULL;
3881	mutex_unlock(&ext4_li_mtx);
3882
3883	return 0;
3884}
3885
3886static void ext4_clear_request_list(void)
3887{
3888	struct list_head *pos, *n;
3889	struct ext4_li_request *elr;
3890
3891	mutex_lock(&ext4_li_info->li_list_mtx);
3892	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
3893		elr = list_entry(pos, struct ext4_li_request,
3894				 lr_request);
3895		ext4_remove_li_request(elr);
3896	}
3897	mutex_unlock(&ext4_li_info->li_list_mtx);
3898}
3899
3900static int ext4_run_lazyinit_thread(void)
3901{
3902	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
3903					 ext4_li_info, "ext4lazyinit");
3904	if (IS_ERR(ext4_lazyinit_task)) {
3905		int err = PTR_ERR(ext4_lazyinit_task);
3906		ext4_clear_request_list();
3907		kfree(ext4_li_info);
3908		ext4_li_info = NULL;
3909		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
3910				 "initialization thread\n",
3911				 err);
3912		return err;
3913	}
3914	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
3915	return 0;
3916}
3917
3918/*
3919 * Check whether it make sense to run itable init. thread or not.
3920 * If there is at least one uninitialized inode table, return
3921 * corresponding group number, else the loop goes through all
3922 * groups and return total number of groups.
3923 */
3924static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
3925{
3926	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
3927	struct ext4_group_desc *gdp = NULL;
3928
3929	if (!ext4_has_group_desc_csum(sb))
3930		return ngroups;
3931
3932	for (group = 0; group < ngroups; group++) {
3933		gdp = ext4_get_group_desc(sb, group, NULL);
3934		if (!gdp)
3935			continue;
3936
3937		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3938			break;
3939	}
3940
3941	return group;
3942}
3943
3944static int ext4_li_info_new(void)
3945{
3946	struct ext4_lazy_init *eli = NULL;
3947
3948	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
3949	if (!eli)
3950		return -ENOMEM;
3951
3952	INIT_LIST_HEAD(&eli->li_request_list);
3953	mutex_init(&eli->li_list_mtx);
3954
3955	eli->li_state |= EXT4_LAZYINIT_QUIT;
3956
3957	ext4_li_info = eli;
3958
3959	return 0;
3960}
3961
3962static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
3963					    ext4_group_t start)
3964{
3965	struct ext4_li_request *elr;
3966
3967	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
3968	if (!elr)
3969		return NULL;
3970
3971	elr->lr_super = sb;
3972	elr->lr_first_not_zeroed = start;
3973	if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
3974		elr->lr_mode = EXT4_LI_MODE_ITABLE;
3975		elr->lr_next_group = start;
3976	} else {
3977		elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
3978	}
3979
3980	/*
3981	 * Randomize first schedule time of the request to
3982	 * spread the inode table initialization requests
3983	 * better.
3984	 */
3985	elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
3986	return elr;
3987}
3988
/*
 * Register a lazy-init request for @sb with the global lazyinit machinery.
 *
 * @sb:               superblock the request is for
 * @first_not_zeroed: group number stored in the new request as its
 *                    starting point
 *
 * All work is done under ext4_li_mtx.  If @sb already has a request, only
 * its lr_timeout is reset.  No request is created for a read-only
 * superblock, nor when block bitmap prefetching is disabled and either
 * @first_not_zeroed equals the number of groups or INIT_INODE_TABLE is
 * not set.  Otherwise a request is allocated, the global ext4_li_info is
 * created if it does not exist yet, the request is queued, and the
 * lazyinit thread is started unless it is already marked as running.
 *
 * Returns 0 on success (including the cases where nothing had to be
 * done), -ENOMEM if the request cannot be allocated, or the error
 * returned by ext4_li_info_new() / ext4_run_lazyinit_thread().
 */
int ext4_register_li_request(struct super_block *sb,
			     ext4_group_t first_not_zeroed)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_li_request *elr = NULL;
	ext4_group_t ngroups = sbi->s_groups_count;
	int ret = 0;

	mutex_lock(&ext4_li_mtx);
	if (sbi->s_li_request != NULL) {
		/*
		 * Reset timeout so it can be computed again, because
		 * s_li_wait_mult might have changed.
		 */
		sbi->s_li_request->lr_timeout = 0;
		goto out;
	}

	/* Nothing to schedule: leave with ret == 0 and no request. */
	if (sb_rdonly(sb) ||
	    (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
	     (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
		goto out;

	elr = ext4_li_request_new(sb, first_not_zeroed);
	if (!elr) {
		ret = -ENOMEM;
		goto out;
	}

	/* Create the global bookkeeping structure on first use. */
	if (NULL == ext4_li_info) {
		ret = ext4_li_info_new();
		if (ret)
			goto out;
	}

	mutex_lock(&ext4_li_info->li_list_mtx);
	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
	mutex_unlock(&ext4_li_info->li_list_mtx);

	sbi->s_li_request = elr;
	/*
	 * set elr to NULL here since it has been inserted to
	 * the request_list and the removal and free of it is
	 * handled by ext4_clear_request_list from now on.
	 */
	elr = NULL;

	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
		ret = ext4_run_lazyinit_thread();
		if (ret)
			goto out;
	}
out:
	mutex_unlock(&ext4_li_mtx);
	/* elr is still non-NULL only if it was never queued. */
	if (ret)
		kfree(elr);
	return ret;
}
4047
4048/*
4049 * We do not need to lock anything since this is called on
4050 * module unload.
4051 */
4052static void ext4_destroy_lazyinit_thread(void)
4053{
4054	/*
4055	 * If thread exited earlier
4056	 * there's nothing to be done.
4057	 */
4058	if (!ext4_li_info || !ext4_lazyinit_task)
4059		return;
4060
4061	kthread_stop(ext4_lazyinit_task);
4062}
4063
4064static int set_journal_csum_feature_set(struct super_block *sb)
4065{
4066	int ret = 1;
4067	int compat, incompat;
4068	struct ext4_sb_info *sbi = EXT4_SB(sb);
4069
4070	if (ext4_has_metadata_csum(sb)) {
4071		/* journal checksum v3 */
4072		compat = 0;
4073		incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
4074	} else {
4075		/* journal checksum v1 */
4076		compat = JBD2_FEATURE_COMPAT_CHECKSUM;
4077		incompat = 0;
4078	}
4079
4080	jbd2_journal_clear_features(sbi->s_journal,
4081			JBD2_FEATURE_COMPAT_CHECKSUM, 0,
4082			JBD2_FEATURE_INCOMPAT_CSUM_V3 |
4083			JBD2_FEATURE_INCOMPAT_CSUM_V2);
4084	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4085		ret = jbd2_journal_set_features(sbi->s_journal,
4086				compat, 0,
4087				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
4088				incompat);
4089	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
4090		ret = jbd2_journal_set_features(sbi->s_journal,
4091				compat, 0,
4092				incompat);
4093		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
4094				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
4095	} else {
4096		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
4097				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
4098	}
4099
4100	return ret;
4101}
4102
/*
 * Note: calculating the overhead so we can be compatible with
 * historical BSD practice is quite difficult in the face of
 * clusters/bigalloc.  This is because multiple metadata blocks from
 * different block groups can end up in the same allocation cluster.
 * Calculating the exact overhead in the face of clustered allocation
 * requires either O(all block bitmaps) in memory or O(number of block
 * groups**2) in time.  We will still calculate the overhead for
 * older file systems --- and if we come across a bigalloc file
 * system with zero in s_overhead_clusters the estimate will be close to
 * correct especially for very large cluster sizes --- but for newer
 * file systems, it's better to calculate this figure once at mkfs
 * time, and store it in the superblock.  If the superblock value is
 * present (even for non-bigalloc file systems), we will use it.
 *
 * count_overhead() returns the overhead attributed to block group @grp.
 * @buf is a caller-supplied scratch bitmap; it is only written on the
 * bigalloc path, where one bit per cluster of @grp is set for every
 * cluster found to hold metadata.  A return value of 0 means no bit was
 * set in @buf.
 */
static int count_overhead(struct super_block *sb, ext4_group_t grp,
			  char *buf)
{
	struct ext4_sb_info	*sbi = EXT4_SB(sb);
	struct ext4_group_desc	*gdp;
	ext4_fsblk_t		first_block, last_block, b;
	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
	int			s, j, count = 0;
	int			has_super = ext4_bg_has_super(sb, grp);

	/*
	 * Without bigalloc the figure is a simple sum: superblock copy,
	 * group descriptor blocks, reserved GDT blocks (only alongside a
	 * superblock copy), the inode table, plus 2 (presumably the block
	 * and inode bitmaps).
	 */
	if (!ext4_has_feature_bigalloc(sb))
		return (has_super + ext4_bg_num_gdb(sb, grp) +
			(has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
			sbi->s_itb_per_group + 2);

	/* Block range covered by group @grp. */
	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
		(grp * EXT4_BLOCKS_PER_GROUP(sb));
	last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
	/*
	 * Walk the descriptors of every group: any group's bitmaps or
	 * inode table may be located inside @grp's block range.
	 */
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		b = ext4_block_bitmap(sb, gdp);
		if (b >= first_block && b <= last_block) {
			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
			count++;
		}
		b = ext4_inode_bitmap(sb, gdp);
		if (b >= first_block && b <= last_block) {
			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
			count++;
		}
		b = ext4_inode_table(sb, gdp);
		if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
			for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
				int c = EXT4_B2C(sbi, b - first_block);
				ext4_set_bit(c, buf);
				count++;
			}
		/* The rest only applies to @grp's own superblock/GDT area. */
		if (i != grp)
			continue;
		s = 0;
		if (ext4_bg_has_super(sb, grp)) {
			ext4_set_bit(s++, buf);
			count++;
		}
		j = ext4_bg_num_gdb(sb, grp);
		/* Clamp a bogus descriptor block count to the group size. */
		if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
			ext4_error(sb, "Invalid number of block group "
				   "descriptor blocks: %d", j);
			j = EXT4_BLOCKS_PER_GROUP(sb) - s;
		}
		count += j;
		for (; j > 0; j--)
			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
	}
	if (!count)
		return 0;
	/* Overhead = clusters in the group minus the bits still clear. */
	return EXT4_CLUSTERS_PER_GROUP(sb) -
		ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
}
4177
4178/*
4179 * Compute the overhead and stash it in sbi->s_overhead
4180 */
4181int ext4_calculate_overhead(struct super_block *sb)
4182{
4183	struct ext4_sb_info *sbi = EXT4_SB(sb);
4184	struct ext4_super_block *es = sbi->s_es;
4185	struct inode *j_inode;
4186	unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
4187	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4188	ext4_fsblk_t overhead = 0;
4189	char *buf = (char *) get_zeroed_page(GFP_NOFS);
4190
4191	if (!buf)
4192		return -ENOMEM;
4193
4194	/*
4195	 * Compute the overhead (FS structures).  This is constant
4196	 * for a given filesystem unless the number of block groups
4197	 * changes so we cache the previous value until it does.
4198	 */
4199
4200	/*
4201	 * All of the blocks before first_data_block are overhead
4202	 */
4203	overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
4204
4205	/*
4206	 * Add the overhead found in each block group
4207	 */
4208	for (i = 0; i < ngroups; i++) {
4209		int blks;
4210
4211		blks = count_overhead(sb, i, buf);
4212		overhead += blks;
4213		if (blks)
4214			memset(buf, 0, PAGE_SIZE);
4215		cond_resched();
4216	}
4217
4218	/*
4219	 * Add the internal journal blocks whether the journal has been
4220	 * loaded or not
4221	 */
4222	if (sbi->s_journal && !sbi->s_journal_bdev)
4223		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
4224	else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
4225		/* j_inum for internal journal is non-zero */
4226		j_inode = ext4_get_journal_inode(sb, j_inum);
4227		if (j_inode) {
4228			j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
4229			overhead += EXT4_NUM_B2C(sbi, j_blocks);
4230			iput(j_inode);
4231		} else {
4232			ext4_msg(sb, KERN_ERR, "can't get journal size");
4233		}
4234	}
4235	sbi->s_overhead = overhead;
4236	smp_wmb();
4237	free_page((unsigned long) buf);
4238	return 0;
4239}
4240
4241static void ext4_set_resv_clusters(struct super_block *sb)
4242{
4243	ext4_fsblk_t resv_clusters;
4244	struct ext4_sb_info *sbi = EXT4_SB(sb);
4245
4246	/*
4247	 * There's no need to reserve anything when we aren't using extents.
4248	 * The space estimates are exact, there are no unwritten extents,
4249	 * hole punching doesn't need new metadata... This is needed especially
4250	 * to keep ext2/3 backward compatibility.
4251	 */
4252	if (!ext4_has_feature_extents(sb))
4253		return;
4254	/*
4255	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
4256	 * This should cover the situations where we can not afford to run
4257	 * out of space like for example punch hole, or converting
4258	 * unwritten extents in delalloc path. In most cases such
4259	 * allocation would require 1, or 2 blocks, higher numbers are
4260	 * very rare.
4261	 */
4262	resv_clusters = (ext4_blocks_count(sbi->s_es) >>
4263			 sbi->s_cluster_bits);
4264
4265	do_div(resv_clusters, 50);
4266	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
4267
4268	atomic64_set(&sbi->s_resv_clusters, resv_clusters);
4269}
4270
/*
 * Return a short string naming the quota mode of @sb: "disabled" when
 * the kernel is built without CONFIG_QUOTA, otherwise "none",
 * "journalled" or "writeback".
 */
static const char *ext4_quota_mode(struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	if (!ext4_quota_capable(sb))
		return "none";

	/* Journalled quota needs both a journal and journalled quota on. */
	return (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) ?
		"journalled" : "writeback";
#else
	return "disabled";
#endif
}
4285
4286static void ext4_setup_csum_trigger(struct super_block *sb,
4287				    enum ext4_journal_trigger_type type,
4288				    void (*trigger)(
4289					struct jbd2_buffer_trigger_type *type,
4290					struct buffer_head *bh,
4291					void *mapped_data,
4292					size_t size))
4293{
4294	struct ext4_sb_info *sbi = EXT4_SB(sb);
4295
4296	sbi->s_journal_triggers[type].sb = sb;
4297	sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
4298}
4299
4300static void ext4_free_sbi(struct ext4_sb_info *sbi)
4301{
4302	if (!sbi)
4303		return;
4304
4305	kfree(sbi->s_blockgroup_lock);
4306	fs_put_dax(sbi->s_daxdev, NULL);
4307	kfree(sbi);
4308}
4309
4310static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
4311{
4312	struct ext4_sb_info *sbi;
4313
4314	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
4315	if (!sbi)
4316		return NULL;
4317
4318	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
4319					   NULL, NULL);
4320
4321	sbi->s_blockgroup_lock =
4322		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
4323
4324	if (!sbi->s_blockgroup_lock)
4325		goto err_out;
4326
4327	sb->s_fs_info = sbi;
4328	sbi->s_sb = sb;
4329	return sbi;
4330err_out:
4331	fs_put_dax(sbi->s_daxdev, NULL);
4332	kfree(sbi);
4333	return NULL;
4334}
4335
4336static void ext4_set_def_opts(struct super_block *sb,
4337			      struct ext4_super_block *es)
4338{
4339	unsigned long def_mount_opts;
4340
4341	/* Set defaults before we parse the mount options */
4342	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
4343	set_opt(sb, INIT_INODE_TABLE);
4344	if (def_mount_opts & EXT4_DEFM_DEBUG)
4345		set_opt(sb, DEBUG);
4346	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
4347		set_opt(sb, GRPID);
4348	if (def_mount_opts & EXT4_DEFM_UID16)
4349		set_opt(sb, NO_UID32);
4350	/* xattr user namespace & acls are now defaulted on */
4351	set_opt(sb, XATTR_USER);
4352#ifdef CONFIG_EXT4_FS_POSIX_ACL
4353	set_opt(sb, POSIX_ACL);
4354#endif
4355	if (ext4_has_feature_fast_commit(sb))
4356		set_opt2(sb, JOURNAL_FAST_COMMIT);
4357	/* don't forget to enable journal_csum when metadata_csum is enabled. */
4358	if (ext4_has_metadata_csum(sb))
4359		set_opt(sb, JOURNAL_CHECKSUM);
4360
4361	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
4362		set_opt(sb, JOURNAL_DATA);
4363	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
4364		set_opt(sb, ORDERED_DATA);
4365	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
4366		set_opt(sb, WRITEBACK_DATA);
4367
4368	if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
4369		set_opt(sb, ERRORS_PANIC);
4370	else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
4371		set_opt(sb, ERRORS_CONT);
4372	else
4373		set_opt(sb, ERRORS_RO);
4374	/* block_validity enabled by default; disable with noblock_validity */
4375	set_opt(sb, BLOCK_VALIDITY);
4376	if (def_mount_opts & EXT4_DEFM_DISCARD)
4377		set_opt(sb, DISCARD);
4378
4379	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
4380		set_opt(sb, BARRIER);
4381
4382	/*
4383	 * enable delayed allocation by default
4384	 * Use -o nodelalloc to turn it off
4385	 */
4386	if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
4387	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
4388		set_opt(sb, DELALLOC);
4389
4390	if (sb->s_blocksize == PAGE_SIZE)
4391		set_opt(sb, DIOREAD_NOLOCK);
4392}
4393
4394static int ext4_handle_clustersize(struct super_block *sb)
4395{
4396	struct ext4_sb_info *sbi = EXT4_SB(sb);
4397	struct ext4_super_block *es = sbi->s_es;
4398	int clustersize;
4399
4400	/* Handle clustersize */
4401	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
4402	if (ext4_has_feature_bigalloc(sb)) {
4403		if (clustersize < sb->s_blocksize) {
4404			ext4_msg(sb, KERN_ERR,
4405				 "cluster size (%d) smaller than "
4406				 "block size (%lu)", clustersize, sb->s_blocksize);
4407			return -EINVAL;
4408		}
4409		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
4410			le32_to_cpu(es->s_log_block_size);
4411		sbi->s_clusters_per_group =
4412			le32_to_cpu(es->s_clusters_per_group);
4413		if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
4414			ext4_msg(sb, KERN_ERR,
4415				 "#clusters per group too big: %lu",
4416				 sbi->s_clusters_per_group);
4417			return -EINVAL;
4418		}
4419		if (sbi->s_blocks_per_group !=
4420		    (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
4421			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
4422				 "clusters per group (%lu) inconsistent",
4423				 sbi->s_blocks_per_group,
4424				 sbi->s_clusters_per_group);
4425			return -EINVAL;
4426		}
4427	} else {
4428		if (clustersize != sb->s_blocksize) {
4429			ext4_msg(sb, KERN_ERR,
4430				 "fragment/cluster size (%d) != "
4431				 "block size (%lu)", clustersize, sb->s_blocksize);
4432			return -EINVAL;
4433		}
4434		if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
4435			ext4_msg(sb, KERN_ERR,
4436				 "#blocks per group too big: %lu",
4437				 sbi->s_blocks_per_group);
4438			return -EINVAL;
4439		}
4440		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
4441		sbi->s_cluster_bits = 0;
4442	}
4443	sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
4444
4445	/* Do we have standard group size of clustersize * 8 blocks ? */
4446	if (sbi->s_blocks_per_group == clustersize << 3)
4447		set_opt2(sb, STD_GROUP_SIZE);
4448
4449	return 0;
4450}
4451
4452static void ext4_fast_commit_init(struct super_block *sb)
4453{
4454	struct ext4_sb_info *sbi = EXT4_SB(sb);
4455
4456	/* Initialize fast commit stuff */
4457	atomic_set(&sbi->s_fc_subtid, 0);
4458	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
4459	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
4460	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
4461	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
4462	sbi->s_fc_bytes = 0;
4463	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
4464	sbi->s_fc_ineligible_tid = 0;
4465	spin_lock_init(&sbi->s_fc_lock);
4466	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
4467	sbi->s_fc_replay_state.fc_regions = NULL;
4468	sbi->s_fc_replay_state.fc_regions_size = 0;
4469	sbi->s_fc_replay_state.fc_regions_used = 0;
4470	sbi->s_fc_replay_state.fc_regions_valid = 0;
4471	sbi->s_fc_replay_state.fc_modified_inodes = NULL;
4472	sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
4473	sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
4474}
4475
4476static int ext4_inode_info_init(struct super_block *sb,
4477				struct ext4_super_block *es)
4478{
4479	struct ext4_sb_info *sbi = EXT4_SB(sb);
4480
4481	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
4482		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
4483		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
4484	} else {
4485		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
4486		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
4487		if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
4488			ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
4489				 sbi->s_first_ino);
4490			return -EINVAL;
4491		}
4492		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
4493		    (!is_power_of_2(sbi->s_inode_size)) ||
4494		    (sbi->s_inode_size > sb->s_blocksize)) {
4495			ext4_msg(sb, KERN_ERR,
4496			       "unsupported inode size: %d",
4497			       sbi->s_inode_size);
4498			ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
4499			return -EINVAL;
4500		}
4501		/*
4502		 * i_atime_extra is the last extra field available for
4503		 * [acm]times in struct ext4_inode. Checking for that
4504		 * field should suffice to ensure we have extra space
4505		 * for all three.
4506		 */
4507		if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
4508			sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
4509			sb->s_time_gran = 1;
4510			sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
4511		} else {
4512			sb->s_time_gran = NSEC_PER_SEC;
4513			sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
4514		}
4515		sb->s_time_min = EXT4_TIMESTAMP_MIN;
4516	}
4517
4518	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
4519		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
4520			EXT4_GOOD_OLD_INODE_SIZE;
4521		if (ext4_has_feature_extra_isize(sb)) {
4522			unsigned v, max = (sbi->s_inode_size -
4523					   EXT4_GOOD_OLD_INODE_SIZE);
4524
4525			v = le16_to_cpu(es->s_want_extra_isize);
4526			if (v > max) {
4527				ext4_msg(sb, KERN_ERR,
4528					 "bad s_want_extra_isize: %d", v);
4529				return -EINVAL;
4530			}
4531			if (sbi->s_want_extra_isize < v)
4532				sbi->s_want_extra_isize = v;
4533
4534			v = le16_to_cpu(es->s_min_extra_isize);
4535			if (v > max) {
4536				ext4_msg(sb, KERN_ERR,
4537					 "bad s_min_extra_isize: %d", v);
4538				return -EINVAL;
4539			}
4540			if (sbi->s_want_extra_isize < v)
4541				sbi->s_want_extra_isize = v;
4542		}
4543	}
4544
4545	return 0;
4546}
4547
4548#if IS_ENABLED(CONFIG_UNICODE)
4549static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
4550{
4551	const struct ext4_sb_encodings *encoding_info;
4552	struct unicode_map *encoding;
4553	__u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
4554
4555	if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
4556		return 0;
4557
4558	encoding_info = ext4_sb_read_encoding(es);
4559	if (!encoding_info) {
4560		ext4_msg(sb, KERN_ERR,
4561			"Encoding requested by superblock is unknown");
4562		return -EINVAL;
4563	}
4564
4565	encoding = utf8_load(encoding_info->version);
4566	if (IS_ERR(encoding)) {
4567		ext4_msg(sb, KERN_ERR,
4568			"can't mount with superblock charset: %s-%u.%u.%u "
4569			"not supported by the kernel. flags: 0x%x.",
4570			encoding_info->name,
4571			unicode_major(encoding_info->version),
4572			unicode_minor(encoding_info->version),
4573			unicode_rev(encoding_info->version),
4574			encoding_flags);
4575		return -EINVAL;
4576	}
4577	ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
4578		"%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
4579		unicode_major(encoding_info->version),
4580		unicode_minor(encoding_info->version),
4581		unicode_rev(encoding_info->version),
4582		encoding_flags);
4583
4584	sb->s_encoding = encoding;
4585	sb->s_encoding_flags = encoding_flags;
4586
4587	return 0;
4588}
4589#else
4590static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
4591{
4592	return 0;
4593}
4594#endif
4595
4596static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
4597{
4598	struct ext4_sb_info *sbi = EXT4_SB(sb);
4599
4600	/* Warn if metadata_csum and gdt_csum are both set. */
4601	if (ext4_has_feature_metadata_csum(sb) &&
4602	    ext4_has_feature_gdt_csum(sb))
4603		ext4_warning(sb, "metadata_csum and uninit_bg are "
4604			     "redundant flags; please run fsck.");
4605
4606	/* Check for a known checksum algorithm */
4607	if (!ext4_verify_csum_type(sb, es)) {
4608		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
4609			 "unknown checksum algorithm.");
4610		return -EINVAL;
4611	}
4612	ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
4613				ext4_orphan_file_block_trigger);
4614
4615	/* Load the checksum driver */
4616	sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
4617	if (IS_ERR(sbi->s_chksum_driver)) {
4618		int ret = PTR_ERR(sbi->s_chksum_driver);
4619		ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
4620		sbi->s_chksum_driver = NULL;
4621		return ret;
4622	}
4623
4624	/* Check superblock checksum */
4625	if (!ext4_superblock_csum_verify(sb, es)) {
4626		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
4627			 "invalid superblock checksum.  Run e2fsck?");
4628		return -EFSBADCRC;
4629	}
4630
4631	/* Precompute checksum seed for all metadata */
4632	if (ext4_has_feature_csum_seed(sb))
4633		sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
4634	else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
4635		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
4636					       sizeof(es->s_uuid));
4637	return 0;
4638}
4639
4640static int ext4_check_feature_compatibility(struct super_block *sb,
4641					    struct ext4_super_block *es,
4642					    int silent)
4643{
4644	struct ext4_sb_info *sbi = EXT4_SB(sb);
4645
4646	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
4647	    (ext4_has_compat_features(sb) ||
4648	     ext4_has_ro_compat_features(sb) ||
4649	     ext4_has_incompat_features(sb)))
4650		ext4_msg(sb, KERN_WARNING,
4651		       "feature flags set on rev 0 fs, "
4652		       "running e2fsck is recommended");
4653
4654	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
4655		set_opt2(sb, HURD_COMPAT);
4656		if (ext4_has_feature_64bit(sb)) {
4657			ext4_msg(sb, KERN_ERR,
4658				 "The Hurd can't support 64-bit file systems");
4659			return -EINVAL;
4660		}
4661
4662		/*
4663		 * ea_inode feature uses l_i_version field which is not
4664		 * available in HURD_COMPAT mode.
4665		 */
4666		if (ext4_has_feature_ea_inode(sb)) {
4667			ext4_msg(sb, KERN_ERR,
4668				 "ea_inode feature is not supported for Hurd");
4669			return -EINVAL;
4670		}
4671	}
4672
4673	if (IS_EXT2_SB(sb)) {
4674		if (ext2_feature_set_ok(sb))
4675			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
4676				 "using the ext4 subsystem");
4677		else {
4678			/*
4679			 * If we're probing be silent, if this looks like
4680			 * it's actually an ext[34] filesystem.
4681			 */
4682			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4683				return -EINVAL;
4684			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
4685				 "to feature incompatibilities");
4686			return -EINVAL;
4687		}
4688	}
4689
4690	if (IS_EXT3_SB(sb)) {
4691		if (ext3_feature_set_ok(sb))
4692			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
4693				 "using the ext4 subsystem");
4694		else {
4695			/*
4696			 * If we're probing be silent, if this looks like
4697			 * it's actually an ext4 filesystem.
4698			 */
4699			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4700				return -EINVAL;
4701			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
4702				 "to feature incompatibilities");
4703			return -EINVAL;
4704		}
4705	}
4706
4707	/*
4708	 * Check feature flags regardless of the revision level, since we
4709	 * previously didn't change the revision level when setting the flags,
4710	 * so there is a chance incompat flags are set on a rev 0 filesystem.
4711	 */
4712	if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
4713		return -EINVAL;
4714
4715	if (sbi->s_daxdev) {
4716		if (sb->s_blocksize == PAGE_SIZE)
4717			set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
4718		else
4719			ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
4720	}
4721
4722	if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
4723		if (ext4_has_feature_inline_data(sb)) {
4724			ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
4725					" that may contain inline data");
4726			return -EINVAL;
4727		}
4728		if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
4729			ext4_msg(sb, KERN_ERR,
4730				"DAX unsupported by block device.");
4731			return -EINVAL;
4732		}
4733	}
4734
4735	if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
4736		ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
4737			 es->s_encryption_level);
4738		return -EINVAL;
4739	}
4740
4741	return 0;
4742}
4743
4744static int ext4_check_geometry(struct super_block *sb,
4745			       struct ext4_super_block *es)
4746{
4747	struct ext4_sb_info *sbi = EXT4_SB(sb);
4748	__u64 blocks_count;
4749	int err;
4750
4751	if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
4752		ext4_msg(sb, KERN_ERR,
4753			 "Number of reserved GDT blocks insanely large: %d",
4754			 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
4755		return -EINVAL;
4756	}
4757	/*
4758	 * Test whether we have more sectors than will fit in sector_t,
4759	 * and whether the max offset is addressable by the page cache.
4760	 */
4761	err = generic_check_addressable(sb->s_blocksize_bits,
4762					ext4_blocks_count(es));
4763	if (err) {
4764		ext4_msg(sb, KERN_ERR, "filesystem"
4765			 " too large to mount safely on this system");
4766		return err;
4767	}
4768
4769	/* check blocks count against device size */
4770	blocks_count = sb_bdev_nr_blocks(sb);
4771	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
4772		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
4773		       "exceeds size of device (%llu blocks)",
4774		       ext4_blocks_count(es), blocks_count);
4775		return -EINVAL;
4776	}
4777
4778	/*
4779	 * It makes no sense for the first data block to be beyond the end
4780	 * of the filesystem.
4781	 */
4782	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
4783		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4784			 "block %u is beyond end of filesystem (%llu)",
4785			 le32_to_cpu(es->s_first_data_block),
4786			 ext4_blocks_count(es));
4787		return -EINVAL;
4788	}
4789	if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
4790	    (sbi->s_cluster_ratio == 1)) {
4791		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4792			 "block is 0 with a 1k block and cluster size");
4793		return -EINVAL;
4794	}
4795
4796	blocks_count = (ext4_blocks_count(es) -
4797			le32_to_cpu(es->s_first_data_block) +
4798			EXT4_BLOCKS_PER_GROUP(sb) - 1);
4799	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
4800	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
4801		ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
4802		       "(block count %llu, first data block %u, "
4803		       "blocks per group %lu)", blocks_count,
4804		       ext4_blocks_count(es),
4805		       le32_to_cpu(es->s_first_data_block),
4806		       EXT4_BLOCKS_PER_GROUP(sb));
4807		return -EINVAL;
4808	}
4809	sbi->s_groups_count = blocks_count;
4810	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
4811			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
4812	if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
4813	    le32_to_cpu(es->s_inodes_count)) {
4814		ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
4815			 le32_to_cpu(es->s_inodes_count),
4816			 ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
4817		return -EINVAL;
4818	}
4819
4820	return 0;
4821}
4822
4823static int ext4_group_desc_init(struct super_block *sb,
4824				struct ext4_super_block *es,
4825				ext4_fsblk_t logical_sb_block,
4826				ext4_group_t *first_not_zeroed)
4827{
4828	struct ext4_sb_info *sbi = EXT4_SB(sb);
4829	unsigned int db_count;
4830	ext4_fsblk_t block;
4831	int i;
4832
4833	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
4834		   EXT4_DESC_PER_BLOCK(sb);
4835	if (ext4_has_feature_meta_bg(sb)) {
4836		if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
4837			ext4_msg(sb, KERN_WARNING,
4838				 "first meta block group too large: %u "
4839				 "(group descriptor block count %u)",
4840				 le32_to_cpu(es->s_first_meta_bg), db_count);
4841			return -EINVAL;
4842		}
4843	}
4844	rcu_assign_pointer(sbi->s_group_desc,
4845			   kvmalloc_array(db_count,
4846					  sizeof(struct buffer_head *),
4847					  GFP_KERNEL));
4848	if (sbi->s_group_desc == NULL) {
4849		ext4_msg(sb, KERN_ERR, "not enough memory");
4850		return -ENOMEM;
4851	}
4852
4853	bgl_lock_init(sbi->s_blockgroup_lock);
4854
4855	/* Pre-read the descriptors into the buffer cache */
4856	for (i = 0; i < db_count; i++) {
4857		block = descriptor_loc(sb, logical_sb_block, i);
4858		ext4_sb_breadahead_unmovable(sb, block);
4859	}
4860
4861	for (i = 0; i < db_count; i++) {
4862		struct buffer_head *bh;
4863
4864		block = descriptor_loc(sb, logical_sb_block, i);
4865		bh = ext4_sb_bread_unmovable(sb, block);
4866		if (IS_ERR(bh)) {
4867			ext4_msg(sb, KERN_ERR,
4868			       "can't read group descriptor %d", i);
4869			sbi->s_gdb_count = i;
4870			return PTR_ERR(bh);
4871		}
4872		rcu_read_lock();
4873		rcu_dereference(sbi->s_group_desc)[i] = bh;
4874		rcu_read_unlock();
4875	}
4876	sbi->s_gdb_count = db_count;
4877	if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
4878		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
4879		return -EFSCORRUPTED;
4880	}
4881
4882	return 0;
4883}
4884
4885static int ext4_load_and_init_journal(struct super_block *sb,
4886				      struct ext4_super_block *es,
4887				      struct ext4_fs_context *ctx)
4888{
4889	struct ext4_sb_info *sbi = EXT4_SB(sb);
4890	int err;
4891
4892	err = ext4_load_journal(sb, es, ctx->journal_devnum);
4893	if (err)
4894		return err;
4895
4896	if (ext4_has_feature_64bit(sb) &&
4897	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4898				       JBD2_FEATURE_INCOMPAT_64BIT)) {
4899		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
4900		goto out;
4901	}
4902
4903	if (!set_journal_csum_feature_set(sb)) {
4904		ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
4905			 "feature set");
4906		goto out;
4907	}
4908
4909	if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
4910		!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4911					  JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
4912		ext4_msg(sb, KERN_ERR,
4913			"Failed to set fast commit journal feature");
4914		goto out;
4915	}
4916
4917	/* We have now updated the journal if required, so we can
4918	 * validate the data journaling mode. */
4919	switch (test_opt(sb, DATA_FLAGS)) {
4920	case 0:
4921		/* No mode set, assume a default based on the journal
4922		 * capabilities: ORDERED_DATA if the journal can
4923		 * cope, else JOURNAL_DATA
4924		 */
4925		if (jbd2_journal_check_available_features
4926		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4927			set_opt(sb, ORDERED_DATA);
4928			sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
4929		} else {
4930			set_opt(sb, JOURNAL_DATA);
4931			sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
4932		}
4933		break;
4934
4935	case EXT4_MOUNT_ORDERED_DATA:
4936	case EXT4_MOUNT_WRITEBACK_DATA:
4937		if (!jbd2_journal_check_available_features
4938		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4939			ext4_msg(sb, KERN_ERR, "Journal does not support "
4940			       "requested data journaling mode");
4941			goto out;
4942		}
4943		break;
4944	default:
4945		break;
4946	}
4947
4948	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
4949	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4950		ext4_msg(sb, KERN_ERR, "can't mount with "
4951			"journal_async_commit in data=ordered mode");
4952		goto out;
4953	}
4954
4955	set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
4956
4957	sbi->s_journal->j_submit_inode_data_buffers =
4958		ext4_journal_submit_inode_data_buffers;
4959	sbi->s_journal->j_finish_inode_data_buffers =
4960		ext4_journal_finish_inode_data_buffers;
4961
4962	return 0;
4963
4964out:
4965	/* flush s_error_work before journal destroy. */
4966	flush_work(&sbi->s_error_work);
4967	jbd2_journal_destroy(sbi->s_journal);
4968	sbi->s_journal = NULL;
4969	return -EINVAL;
4970}
4971
4972static int ext4_check_journal_data_mode(struct super_block *sb)
4973{
4974	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4975		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
4976			    "data=journal disables delayed allocation, "
4977			    "dioread_nolock, O_DIRECT and fast_commit support!\n");
4978		/* can't mount with both data=journal and dioread_nolock. */
4979		clear_opt(sb, DIOREAD_NOLOCK);
4980		clear_opt2(sb, JOURNAL_FAST_COMMIT);
4981		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4982			ext4_msg(sb, KERN_ERR, "can't mount with "
4983				 "both data=journal and delalloc");
4984			return -EINVAL;
4985		}
4986		if (test_opt(sb, DAX_ALWAYS)) {
4987			ext4_msg(sb, KERN_ERR, "can't mount with "
4988				 "both data=journal and dax");
4989			return -EINVAL;
4990		}
4991		if (ext4_has_feature_encrypt(sb)) {
4992			ext4_msg(sb, KERN_WARNING,
4993				 "encrypted files will use data=ordered "
4994				 "instead of data journaling mode");
4995		}
4996		if (test_opt(sb, DELALLOC))
4997			clear_opt(sb, DELALLOC);
4998	} else {
4999		sb->s_iflags |= SB_I_CGROUPWB;
5000	}
5001
5002	return 0;
5003}
5004
5005static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
5006			   int silent)
5007{
5008	struct ext4_sb_info *sbi = EXT4_SB(sb);
5009	struct ext4_super_block *es;
5010	ext4_fsblk_t logical_sb_block;
5011	unsigned long offset = 0;
5012	struct buffer_head *bh;
5013	int ret = -EINVAL;
5014	int blocksize;
5015
5016	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
5017	if (!blocksize) {
5018		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
5019		return -EINVAL;
5020	}
5021
5022	/*
5023	 * The ext4 superblock will not be buffer aligned for other than 1kB
5024	 * block sizes.  We need to calculate the offset from buffer start.
5025	 */
5026	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
5027		logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
5028		offset = do_div(logical_sb_block, blocksize);
5029	} else {
5030		logical_sb_block = sbi->s_sb_block;
5031	}
5032
5033	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
5034	if (IS_ERR(bh)) {
5035		ext4_msg(sb, KERN_ERR, "unable to read superblock");
5036		return PTR_ERR(bh);
5037	}
5038	/*
5039	 * Note: s_es must be initialized as soon as possible because
5040	 *       some ext4 macro-instructions depend on its value
5041	 */
5042	es = (struct ext4_super_block *) (bh->b_data + offset);
5043	sbi->s_es = es;
5044	sb->s_magic = le16_to_cpu(es->s_magic);
5045	if (sb->s_magic != EXT4_SUPER_MAGIC) {
5046		if (!silent)
5047			ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
5048		goto out;
5049	}
5050
5051	if (le32_to_cpu(es->s_log_block_size) >
5052	    (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
5053		ext4_msg(sb, KERN_ERR,
5054			 "Invalid log block size: %u",
5055			 le32_to_cpu(es->s_log_block_size));
5056		goto out;
5057	}
5058	if (le32_to_cpu(es->s_log_cluster_size) >
5059	    (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
5060		ext4_msg(sb, KERN_ERR,
5061			 "Invalid log cluster size: %u",
5062			 le32_to_cpu(es->s_log_cluster_size));
5063		goto out;
5064	}
5065
5066	blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
5067
5068	/*
5069	 * If the default block size is not the same as the real block size,
5070	 * we need to reload it.
5071	 */
5072	if (sb->s_blocksize == blocksize) {
5073		*lsb = logical_sb_block;
5074		sbi->s_sbh = bh;
5075		return 0;
5076	}
5077
5078	/*
5079	 * bh must be released before kill_bdev(), otherwise
5080	 * it won't be freed and its page also. kill_bdev()
5081	 * is called by sb_set_blocksize().
5082	 */
5083	brelse(bh);
5084	/* Validate the filesystem blocksize */
5085	if (!sb_set_blocksize(sb, blocksize)) {
5086		ext4_msg(sb, KERN_ERR, "bad block size %d",
5087				blocksize);
5088		bh = NULL;
5089		goto out;
5090	}
5091
5092	logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
5093	offset = do_div(logical_sb_block, blocksize);
5094	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
5095	if (IS_ERR(bh)) {
5096		ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
5097		ret = PTR_ERR(bh);
5098		bh = NULL;
5099		goto out;
5100	}
5101	es = (struct ext4_super_block *)(bh->b_data + offset);
5102	sbi->s_es = es;
5103	if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
5104		ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
5105		goto out;
5106	}
5107	*lsb = logical_sb_block;
5108	sbi->s_sbh = bh;
5109	return 0;
5110out:
5111	brelse(bh);
5112	return ret;
5113}
5114
5115static void ext4_hash_info_init(struct super_block *sb)
5116{
5117	struct ext4_sb_info *sbi = EXT4_SB(sb);
5118	struct ext4_super_block *es = sbi->s_es;
5119	unsigned int i;
5120
5121	for (i = 0; i < 4; i++)
5122		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
5123
5124	sbi->s_def_hash_version = es->s_def_hash_version;
5125	if (ext4_has_feature_dir_index(sb)) {
5126		i = le32_to_cpu(es->s_flags);
5127		if (i & EXT2_FLAGS_UNSIGNED_HASH)
5128			sbi->s_hash_unsigned = 3;
5129		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
5130#ifdef __CHAR_UNSIGNED__
5131			if (!sb_rdonly(sb))
5132				es->s_flags |=
5133					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
5134			sbi->s_hash_unsigned = 3;
5135#else
5136			if (!sb_rdonly(sb))
5137				es->s_flags |=
5138					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
5139#endif
5140		}
5141	}
5142}
5143
5144static int ext4_block_group_meta_init(struct super_block *sb, int silent)
5145{
5146	struct ext4_sb_info *sbi = EXT4_SB(sb);
5147	struct ext4_super_block *es = sbi->s_es;
5148	int has_huge_files;
5149
5150	has_huge_files = ext4_has_feature_huge_file(sb);
5151	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
5152						      has_huge_files);
5153	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
5154
5155	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
5156	if (ext4_has_feature_64bit(sb)) {
5157		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
5158		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
5159		    !is_power_of_2(sbi->s_desc_size)) {
5160			ext4_msg(sb, KERN_ERR,
5161			       "unsupported descriptor size %lu",
5162			       sbi->s_desc_size);
5163			return -EINVAL;
5164		}
5165	} else
5166		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
5167
5168	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
5169	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
5170
5171	sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
5172	if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
5173		if (!silent)
5174			ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
5175		return -EINVAL;
5176	}
5177	if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
5178	    sbi->s_inodes_per_group > sb->s_blocksize * 8) {
5179		ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
5180			 sbi->s_inodes_per_group);
5181		return -EINVAL;
5182	}
5183	sbi->s_itb_per_group = sbi->s_inodes_per_group /
5184					sbi->s_inodes_per_block;
5185	sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
5186	sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
5187	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
5188	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
5189
5190	return 0;
5191}
5192
/*
 * Do the bulk of the mount work: load and validate the on-disk
 * superblock, apply the mount options, read the group descriptors,
 * load the journal (if any), set up the root dentry, the allocator and
 * the remaining per-superblock state.
 *
 * @fc: fs context carrying the parsed mount options (fc->fs_private)
 * @sb: superblock to fill; its ext4_sb_info must already be attached
 *
 * Returns 0 on success.  On failure a negative errno is returned, the
 * state set up so far is torn down through the failed_mount* labels at
 * the end of the function and sb->s_fs_info is cleared.  The sb_info
 * itself is not freed here.
 */
static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
{
	struct ext4_super_block *es = NULL;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_fsblk_t logical_sb_block;
	struct inode *root;
	int needs_recovery;
	int err;
	ext4_group_t first_not_zeroed;
	struct ext4_fs_context *ctx = fc->fs_private;
	int silent = fc->sb_flags & SB_SILENT;

	/* Set defaults for the variables that will be set during parsing */
	if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
		ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;

	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
	sbi->s_sectors_written_start =
		part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);

	/* Nothing to unwind yet if the superblock cannot be loaded. */
	err = ext4_load_super(sb, &logical_sb_block, silent);
	if (err)
		goto out_fail;

	es = sbi->s_es;
	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);

	err = ext4_init_metadata_csum(sb, es);
	if (err)
		goto failed_mount;

	ext4_set_def_opts(sb, es);

	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;

	/*
	 * set default s_li_wait_mult for lazyinit, for the case there is
	 * no mount option specified.
	 */
	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;

	err = ext4_inode_info_init(sb, es);
	if (err)
		goto failed_mount;

	err = parse_apply_sb_mount_options(sb, ctx);
	if (err < 0)
		goto failed_mount;

	/*
	 * Snapshot the option words before the fs context's options are
	 * applied; the nojournal branch below compares against this.
	 */
	sbi->s_def_mount_opt = sbi->s_mount_opt;
	sbi->s_def_mount_opt2 = sbi->s_mount_opt2;

	err = ext4_check_opt_consistency(fc, sb);
	if (err < 0)
		goto failed_mount;

	ext4_apply_options(fc, sb);

	err = ext4_encoding_init(sb, es);
	if (err)
		goto failed_mount;

	err = ext4_check_journal_data_mode(sb);
	if (err)
		goto failed_mount;

	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

	/* i_version is always enabled now */
	sb->s_flags |= SB_I_VERSION;

	err = ext4_check_feature_compatibility(sb, es, silent);
	if (err)
		goto failed_mount;

	err = ext4_block_group_meta_init(sb, silent);
	if (err)
		goto failed_mount;

	ext4_hash_info_init(sb);

	err = ext4_handle_clustersize(sb);
	if (err)
		goto failed_mount;

	err = ext4_check_geometry(sb, es);
	if (err)
		goto failed_mount;

	/*
	 * From here on failures must go to failed_mount3 or later, which
	 * flush s_error_work and delete the s_err_report timer set up here.
	 */
	timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
	spin_lock_init(&sbi->s_error_lock);
	INIT_WORK(&sbi->s_error_work, flush_stashed_error_work);

	err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
	if (err)
		goto failed_mount3;

	err = ext4_es_register_shrinker(sbi);
	if (err)
		goto failed_mount3;

	sbi->s_stripe = ext4_get_stripe_size(sbi);
	sbi->s_extent_max_zeroout_kb = 32;

	/*
	 * set up enough so that it can read an inode
	 */
	sb->s_op = &ext4_sops;
	sb->s_export_op = &ext4_export_ops;
	sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_FS_ENCRYPTION
	sb->s_cop = &ext4_cryptops;
#endif
#ifdef CONFIG_FS_VERITY
	sb->s_vop = &ext4_verityops;
#endif
#ifdef CONFIG_QUOTA
	sb->dq_op = &ext4_quota_operations;
	if (ext4_has_feature_quota(sb))
		sb->s_qcop = &dquot_quotactl_sysfile_ops;
	else
		sb->s_qcop = &ext4_qctl_operations;
	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
	memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));

	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
	mutex_init(&sbi->s_orphan_lock);

	ext4_fast_commit_init(sb);

	sb->s_root = NULL;

	/*
	 * Remember whether recovery work is pending so that it can be
	 * marked complete near the end of the mount.
	 */
	needs_recovery = (es->s_last_orphan != 0 ||
			  ext4_has_feature_orphan_present(sb) ||
			  ext4_has_feature_journal_needs_recovery(sb));

	if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
		err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
		if (err)
			goto failed_mount3a;
	}

	/*
	 * Default error code for the bare "goto failed_mount3a" exits in
	 * the journal option checks below.
	 */
	err = -EINVAL;
	/*
	 * The first inode we look at is the journal inode.  Don't try
	 * root first: it may be modified in the journal!
	 */
	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
		err = ext4_load_and_init_journal(sb, es, ctx);
		if (err)
			goto failed_mount3a;
	} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
		   ext4_has_feature_journal_needs_recovery(sb)) {
		ext4_msg(sb, KERN_ERR, "required journal recovery "
		       "suppressed and not mounted read-only");
		goto failed_mount3a;
	} else {
		/* Nojournal mode, all journal mount options are illegal */
		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "journal_async_commit, fs mounted w/o journal");
			goto failed_mount3a;
		}

		if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "journal_checksum, fs mounted w/o journal");
			goto failed_mount3a;
		}
		if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "commit=%lu, fs mounted w/o journal",
				 sbi->s_commit_interval / HZ);
			goto failed_mount3a;
		}
		/* A data= mode differing from the snapshot taken above. */
		if (EXT4_MOUNT_DATA_FLAGS &
		    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "data=, fs mounted w/o journal");
			goto failed_mount3a;
		}
		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
		clear_opt(sb, JOURNAL_CHECKSUM);
		clear_opt(sb, DATA_FLAGS);
		clear_opt2(sb, JOURNAL_FAST_COMMIT);
		sbi->s_journal = NULL;
		needs_recovery = 0;
	}

	if (!test_opt(sb, NO_MBCACHE)) {
		sbi->s_ea_block_cache = ext4_xattr_create_cache();
		if (!sbi->s_ea_block_cache) {
			ext4_msg(sb, KERN_ERR,
				 "Failed to create ea_block_cache");
			err = -EINVAL;
			goto failed_mount_wq;
		}

		if (ext4_has_feature_ea_inode(sb)) {
			sbi->s_ea_inode_cache = ext4_xattr_create_cache();
			if (!sbi->s_ea_inode_cache) {
				ext4_msg(sb, KERN_ERR,
					 "Failed to create ea_inode_cache");
				err = -EINVAL;
				goto failed_mount_wq;
			}
		}
	}

	/*
	 * Get the # of file system overhead blocks from the
	 * superblock if present.
	 */
	sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
	/* ignore the precalculated value if it is ridiculous */
	if (sbi->s_overhead > ext4_blocks_count(es))
		sbi->s_overhead = 0;
	/*
	 * If the bigalloc feature is not enabled recalculating the
	 * overhead doesn't take long, so we might as well just redo
	 * it to make sure we are using the correct value.
	 */
	if (!ext4_has_feature_bigalloc(sb))
		sbi->s_overhead = 0;
	if (sbi->s_overhead == 0) {
		err = ext4_calculate_overhead(sb);
		if (err)
			goto failed_mount_wq;
	}

	/*
	 * The maximum number of concurrent works can be high and
	 * concurrency isn't really necessary.  Limit it to 1.
	 */
	EXT4_SB(sb)->rsv_conversion_wq =
		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
	if (!EXT4_SB(sb)->rsv_conversion_wq) {
		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
		err = -ENOMEM;
		goto failed_mount4;
	}

	/*
	 * The jbd2_journal_load will have done any necessary log recovery,
	 * so we can safely mount the rest of the filesystem now.
	 */

	root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
	if (IS_ERR(root)) {
		ext4_msg(sb, KERN_ERR, "get root inode failed");
		err = PTR_ERR(root);
		root = NULL;
		goto failed_mount4;
	}
	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
		iput(root);
		err = -EFSCORRUPTED;
		goto failed_mount4;
	}

	/* No iput(root) on failure here, unlike the check just above. */
	sb->s_root = d_make_root(root);
	if (!sb->s_root) {
		ext4_msg(sb, KERN_ERR, "get root dentry failed");
		err = -ENOMEM;
		goto failed_mount4;
	}

	/* -EROFS is not fatal: carry on as a read-only mount. */
	err = ext4_setup_super(sb, es, sb_rdonly(sb));
	if (err == -EROFS) {
		sb->s_flags |= SB_RDONLY;
	} else if (err)
		goto failed_mount4a;

	ext4_set_resv_clusters(sb);

	if (test_opt(sb, BLOCK_VALIDITY)) {
		err = ext4_setup_system_zone(sb);
		if (err) {
			ext4_msg(sb, KERN_ERR, "failed to initialize system "
				 "zone (%d)", err);
			goto failed_mount4a;
		}
	}
	ext4_fc_replay_cleanup(sb);

	ext4_ext_init(sb);

	/*
	 * Enable optimize_scan if number of groups is > threshold. This can be
	 * turned off by passing "mb_optimize_scan=0". This can also be
	 * turned on forcefully by passing "mb_optimize_scan=1".
	 */
	if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
		if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
			set_opt2(sb, MB_OPTIMIZE_SCAN);
		else
			clear_opt2(sb, MB_OPTIMIZE_SCAN);
	}

	err = ext4_mb_init(sb);
	if (err) {
		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
			 err);
		goto failed_mount5;
	}

	/*
	 * We can only set up the journal commit callback once
	 * mballoc is initialized
	 */
	if (sbi->s_journal)
		sbi->s_journal->j_commit_callback =
			ext4_journal_commit_callback;

	err = ext4_percpu_param_init(sbi);
	if (err)
		goto failed_mount6;

	if (ext4_has_feature_flex_bg(sb))
		if (!ext4_fill_flex_info(sb)) {
			ext4_msg(sb, KERN_ERR,
			       "unable to initialize "
			       "flex_bg meta info!");
			err = -ENOMEM;
			goto failed_mount6;
		}

	err = ext4_register_li_request(sb, first_not_zeroed);
	if (err)
		goto failed_mount6;

	err = ext4_register_sysfs(sb);
	if (err)
		goto failed_mount7;

	err = ext4_init_orphan_info(sb);
	if (err)
		goto failed_mount8;
#ifdef CONFIG_QUOTA
	/* Enable quota usage during mount. */
	if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
		err = ext4_enable_quotas(sb);
		if (err)
			goto failed_mount9;
	}
#endif  /* CONFIG_QUOTA */

	/*
	 * Save the original bdev mapping's wb_err value which could be
	 * used to detect the metadata async write error.
	 */
	spin_lock_init(&sbi->s_bdev_wb_lock);
	errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
				 &sbi->s_bdev_wb_err);
	sb->s_bdev->bd_super = sb;
	/* EXT4_ORPHAN_FS is set only for the duration of orphan cleanup. */
	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
	ext4_orphan_cleanup(sb, es);
	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
	/*
	 * Update the checksum after updating free space/inode counters and
	 * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
	 * checksum in the buffer cache until it is written out and
	 * e2fsprogs programs trying to open a file system immediately
	 * after it is mounted can fail.
	 */
	ext4_superblock_csum_set(sb);
	if (needs_recovery) {
		ext4_msg(sb, KERN_INFO, "recovery complete");
		err = ext4_mark_recovery_complete(sb, es);
		if (err)
			goto failed_mount9;
	}

	if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
		ext4_msg(sb, KERN_WARNING,
			 "mounting with \"discard\" option, but the device does not support discard");

	if (es->s_error_count)
		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */

	/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
	ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
	atomic_set(&sbi->s_warning_count, 0);
	atomic_set(&sbi->s_msg_count, 0);

	return 0;

	/*
	 * Error unwinding.  Each label falls through into the ones below
	 * it, so a later setup failure enters at a higher label.
	 *
	 * NOTE(review): the ext4_mark_recovery_complete() failure above
	 * jumps to failed_mount9 after ext4_enable_quotas() may already
	 * have succeeded, and nothing on this path visibly turns quotas
	 * off again - confirm whether that is handled elsewhere.
	 */
failed_mount9:
	ext4_release_orphan_info(sb);
failed_mount8:
	ext4_unregister_sysfs(sb);
	kobject_put(&sbi->s_kobj);
failed_mount7:
	ext4_unregister_li_request(sb);
failed_mount6:
	ext4_mb_release(sb);
	ext4_flex_groups_free(sbi);
	ext4_percpu_param_destroy(sbi);
failed_mount5:
	ext4_ext_release(sb);
	ext4_release_system_zone(sb);
failed_mount4a:
	dput(sb->s_root);
	sb->s_root = NULL;
failed_mount4:
	ext4_msg(sb, KERN_ERR, "mount failed");
	/* The workqueue allocation failure itself also lands here. */
	if (EXT4_SB(sb)->rsv_conversion_wq)
		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
	sbi->s_ea_inode_cache = NULL;

	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
	sbi->s_ea_block_cache = NULL;

	if (sbi->s_journal) {
		/* flush s_error_work before journal destroy. */
		flush_work(&sbi->s_error_work);
		jbd2_journal_destroy(sbi->s_journal);
		sbi->s_journal = NULL;
	}
failed_mount3a:
	ext4_es_unregister_shrinker(sbi);
failed_mount3:
	/* flush s_error_work before sbi destroy */
	flush_work(&sbi->s_error_work);
	del_timer_sync(&sbi->s_err_report);
	ext4_stop_mmpd(sbi);
	ext4_group_desc_free(sbi);
failed_mount:
	if (sbi->s_chksum_driver)
		crypto_free_shash(sbi->s_chksum_driver);

#if IS_ENABLED(CONFIG_UNICODE)
	utf8_unload(sb->s_encoding);
#endif

#ifdef CONFIG_QUOTA
	for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(get_qf_name(sb, sbi, i));
#endif
	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
	/* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
	brelse(sbi->s_sbh);
	ext4_blkdev_remove(sbi);
out_fail:
	sb->s_fs_info = NULL;
	return err;
}
5651
5652static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
5653{
5654	struct ext4_fs_context *ctx = fc->fs_private;
5655	struct ext4_sb_info *sbi;
5656	const char *descr;
5657	int ret;
5658
5659	sbi = ext4_alloc_sbi(sb);
5660	if (!sbi)
5661		return -ENOMEM;
5662
5663	fc->s_fs_info = sbi;
5664
5665	/* Cleanup superblock name */
5666	strreplace(sb->s_id, '/', '!');
5667
5668	sbi->s_sb_block = 1;	/* Default super block location */
5669	if (ctx->spec & EXT4_SPEC_s_sb_block)
5670		sbi->s_sb_block = ctx->s_sb_block;
5671
5672	ret = __ext4_fill_super(fc, sb);
5673	if (ret < 0)
5674		goto free_sbi;
5675
5676	if (sbi->s_journal) {
5677		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
5678			descr = " journalled data mode";
5679		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
5680			descr = " ordered data mode";
5681		else
5682			descr = " writeback data mode";
5683	} else
5684		descr = "out journal";
5685
5686	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
5687		ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. "
5688			 "Quota mode: %s.", &sb->s_uuid,
5689			 sb_rdonly(sb) ? "ro" : "r/w", descr,
5690			 ext4_quota_mode(sb));
5691
5692	/* Update the s_overhead_clusters if necessary */
5693	ext4_update_overhead(sb, false);
5694	return 0;
5695
5696free_sbi:
5697	ext4_free_sbi(sbi);
5698	fc->s_fs_info = NULL;
5699	return ret;
5700}
5701
5702static int ext4_get_tree(struct fs_context *fc)
5703{
5704	return get_tree_bdev(fc, ext4_fill_super);
5705}
5706
5707/*
5708 * Setup any per-fs journal parameters now.  We'll do this both on
5709 * initial mount, once the journal has been initialised but before we've
5710 * done any recovery; and again on any subsequent remount.
5711 */
5712static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
5713{
5714	struct ext4_sb_info *sbi = EXT4_SB(sb);
5715
5716	journal->j_commit_interval = sbi->s_commit_interval;
5717	journal->j_min_batch_time = sbi->s_min_batch_time;
5718	journal->j_max_batch_time = sbi->s_max_batch_time;
5719	ext4_fc_init(sb, journal);
5720
5721	write_lock(&journal->j_state_lock);
5722	if (test_opt(sb, BARRIER))
5723		journal->j_flags |= JBD2_BARRIER;
5724	else
5725		journal->j_flags &= ~JBD2_BARRIER;
5726	if (test_opt(sb, DATA_ERR_ABORT))
5727		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
5728	else
5729		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
5730	write_unlock(&journal->j_state_lock);
5731}
5732
5733static struct inode *ext4_get_journal_inode(struct super_block *sb,
5734					     unsigned int journal_inum)
5735{
5736	struct inode *journal_inode;
5737
5738	/*
5739	 * Test for the existence of a valid inode on disk.  Bad things
5740	 * happen if we iget() an unused inode, as the subsequent iput()
5741	 * will try to delete it.
5742	 */
5743	journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
5744	if (IS_ERR(journal_inode)) {
5745		ext4_msg(sb, KERN_ERR, "no journal found");
5746		return NULL;
5747	}
5748	if (!journal_inode->i_nlink) {
5749		make_bad_inode(journal_inode);
5750		iput(journal_inode);
5751		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
5752		return NULL;
5753	}
5754
5755	ext4_debug("Journal inode found at %p: %lld bytes\n",
5756		  journal_inode, journal_inode->i_size);
5757	if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
5758		ext4_msg(sb, KERN_ERR, "invalid journal inode");
5759		iput(journal_inode);
5760		return NULL;
5761	}
5762	return journal_inode;
5763}
5764
5765static int ext4_journal_bmap(journal_t *journal, sector_t *block)
5766{
5767	struct ext4_map_blocks map;
5768	int ret;
5769
5770	if (journal->j_inode == NULL)
5771		return 0;
5772
5773	map.m_lblk = *block;
5774	map.m_len = 1;
5775	ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0);
5776	if (ret <= 0) {
5777		ext4_msg(journal->j_inode->i_sb, KERN_CRIT,
5778			 "journal bmap failed: block %llu ret %d\n",
5779			 *block, ret);
5780		jbd2_journal_abort(journal, ret ? ret : -EIO);
5781		return ret;
5782	}
5783	*block = map.m_pblk;
5784	return 0;
5785}
5786
5787static journal_t *ext4_get_journal(struct super_block *sb,
5788				   unsigned int journal_inum)
5789{
5790	struct inode *journal_inode;
5791	journal_t *journal;
5792
5793	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5794		return NULL;
5795
5796	journal_inode = ext4_get_journal_inode(sb, journal_inum);
5797	if (!journal_inode)
5798		return NULL;
5799
5800	journal = jbd2_journal_init_inode(journal_inode);
5801	if (!journal) {
5802		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
5803		iput(journal_inode);
5804		return NULL;
5805	}
5806	journal->j_private = sb;
5807	journal->j_bmap = ext4_journal_bmap;
5808	ext4_init_journal_params(sb, journal);
5809	return journal;
5810}
5811
5812static journal_t *ext4_get_dev_journal(struct super_block *sb,
5813				       dev_t j_dev)
5814{
5815	struct buffer_head *bh;
5816	journal_t *journal;
5817	ext4_fsblk_t start;
5818	ext4_fsblk_t len;
5819	int hblock, blocksize;
5820	ext4_fsblk_t sb_block;
5821	unsigned long offset;
5822	struct ext4_super_block *es;
5823	struct block_device *bdev;
5824
5825	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5826		return NULL;
5827
5828	bdev = ext4_blkdev_get(j_dev, sb);
5829	if (bdev == NULL)
5830		return NULL;
5831
5832	blocksize = sb->s_blocksize;
5833	hblock = bdev_logical_block_size(bdev);
5834	if (blocksize < hblock) {
5835		ext4_msg(sb, KERN_ERR,
5836			"blocksize too small for journal device");
5837		goto out_bdev;
5838	}
5839
5840	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
5841	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
5842	set_blocksize(bdev, blocksize);
5843	if (!(bh = __bread(bdev, sb_block, blocksize))) {
5844		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
5845		       "external journal");
5846		goto out_bdev;
5847	}
5848
5849	es = (struct ext4_super_block *) (bh->b_data + offset);
5850	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
5851	    !(le32_to_cpu(es->s_feature_incompat) &
5852	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
5853		ext4_msg(sb, KERN_ERR, "external journal has "
5854					"bad superblock");
5855		brelse(bh);
5856		goto out_bdev;
5857	}
5858
5859	if ((le32_to_cpu(es->s_feature_ro_compat) &
5860	     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
5861	    es->s_checksum != ext4_superblock_csum(sb, es)) {
5862		ext4_msg(sb, KERN_ERR, "external journal has "
5863				       "corrupt superblock");
5864		brelse(bh);
5865		goto out_bdev;
5866	}
5867
5868	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
5869		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
5870		brelse(bh);
5871		goto out_bdev;
5872	}
5873
5874	len = ext4_blocks_count(es);
5875	start = sb_block + 1;
5876	brelse(bh);	/* we're done with the superblock */
5877
5878	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
5879					start, len, blocksize);
5880	if (!journal) {
5881		ext4_msg(sb, KERN_ERR, "failed to create device journal");
5882		goto out_bdev;
5883	}
5884	journal->j_private = sb;
5885	if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) {
5886		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
5887		goto out_journal;
5888	}
5889	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
5890		ext4_msg(sb, KERN_ERR, "External journal has more than one "
5891					"user (unsupported) - %d",
5892			be32_to_cpu(journal->j_superblock->s_nr_users));
5893		goto out_journal;
5894	}
5895	EXT4_SB(sb)->s_journal_bdev = bdev;
5896	ext4_init_journal_params(sb, journal);
5897	return journal;
5898
5899out_journal:
5900	jbd2_journal_destroy(journal);
5901out_bdev:
5902	ext4_blkdev_put(bdev);
5903	return NULL;
5904}
5905
5906static int ext4_load_journal(struct super_block *sb,
5907			     struct ext4_super_block *es,
5908			     unsigned long journal_devnum)
5909{
5910	journal_t *journal;
5911	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
5912	dev_t journal_dev;
5913	int err = 0;
5914	int really_read_only;
5915	int journal_dev_ro;
5916
5917	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5918		return -EFSCORRUPTED;
5919
5920	if (journal_devnum &&
5921	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
5922		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
5923			"numbers have changed");
5924		journal_dev = new_decode_dev(journal_devnum);
5925	} else
5926		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
5927
5928	if (journal_inum && journal_dev) {
5929		ext4_msg(sb, KERN_ERR,
5930			 "filesystem has both journal inode and journal device!");
5931		return -EINVAL;
5932	}
5933
5934	if (journal_inum) {
5935		journal = ext4_get_journal(sb, journal_inum);
5936		if (!journal)
5937			return -EINVAL;
5938	} else {
5939		journal = ext4_get_dev_journal(sb, journal_dev);
5940		if (!journal)
5941			return -EINVAL;
5942	}
5943
5944	journal_dev_ro = bdev_read_only(journal->j_dev);
5945	really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;
5946
5947	if (journal_dev_ro && !sb_rdonly(sb)) {
5948		ext4_msg(sb, KERN_ERR,
5949			 "journal device read-only, try mounting with '-o ro'");
5950		err = -EROFS;
5951		goto err_out;
5952	}
5953
5954	/*
5955	 * Are we loading a blank journal or performing recovery after a
5956	 * crash?  For recovery, we need to check in advance whether we
5957	 * can get read-write access to the device.
5958	 */
5959	if (ext4_has_feature_journal_needs_recovery(sb)) {
5960		if (sb_rdonly(sb)) {
5961			ext4_msg(sb, KERN_INFO, "INFO: recovery "
5962					"required on readonly filesystem");
5963			if (really_read_only) {
5964				ext4_msg(sb, KERN_ERR, "write access "
5965					"unavailable, cannot proceed "
5966					"(try mounting with noload)");
5967				err = -EROFS;
5968				goto err_out;
5969			}
5970			ext4_msg(sb, KERN_INFO, "write access will "
5971			       "be enabled during recovery");
5972		}
5973	}
5974
5975	if (!(journal->j_flags & JBD2_BARRIER))
5976		ext4_msg(sb, KERN_INFO, "barriers disabled");
5977
5978	if (!ext4_has_feature_journal_needs_recovery(sb))
5979		err = jbd2_journal_wipe(journal, !really_read_only);
5980	if (!err) {
5981		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
5982
5983		if (save)
5984			memcpy(save, ((char *) es) +
5985			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
5986		err = jbd2_journal_load(journal);
5987		if (save)
5988			memcpy(((char *) es) + EXT4_S_ERR_START,
5989			       save, EXT4_S_ERR_LEN);
5990		kfree(save);
5991		es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state &
5992					   EXT4_ERROR_FS);
5993		/* Write out restored error information to the superblock */
5994		if (!bdev_read_only(sb->s_bdev)) {
5995			int err2;
5996			err2 = ext4_commit_super(sb);
5997			err = err ? : err2;
5998		}
5999	}
6000
6001	if (err) {
6002		ext4_msg(sb, KERN_ERR, "error loading journal");
6003		goto err_out;
6004	}
6005
6006	EXT4_SB(sb)->s_journal = journal;
6007	err = ext4_clear_journal_err(sb, es);
6008	if (err) {
6009		EXT4_SB(sb)->s_journal = NULL;
6010		jbd2_journal_destroy(journal);
6011		return err;
6012	}
6013
6014	if (!really_read_only && journal_devnum &&
6015	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
6016		es->s_journal_dev = cpu_to_le32(journal_devnum);
6017		ext4_commit_super(sb);
6018	}
6019	if (!really_read_only && journal_inum &&
6020	    journal_inum != le32_to_cpu(es->s_journal_inum)) {
6021		es->s_journal_inum = cpu_to_le32(journal_inum);
6022		ext4_commit_super(sb);
6023	}
6024
6025	return 0;
6026
6027err_out:
6028	jbd2_journal_destroy(journal);
6029	return err;
6030}
6031
6032/* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */
6033static void ext4_update_super(struct super_block *sb)
6034{
6035	struct ext4_sb_info *sbi = EXT4_SB(sb);
6036	struct ext4_super_block *es = sbi->s_es;
6037	struct buffer_head *sbh = sbi->s_sbh;
6038
6039	lock_buffer(sbh);
6040	/*
6041	 * If the file system is mounted read-only, don't update the
6042	 * superblock write time.  This avoids updating the superblock
6043	 * write time when we are mounting the root file system
6044	 * read/only but we need to replay the journal; at that point,
6045	 * for people who are east of GMT and who make their clock
6046	 * tick in localtime for Windows bug-for-bug compatibility,
6047	 * the clock is set in the future, and this will cause e2fsck
6048	 * to complain and force a full file system check.
6049	 */
6050	if (!(sb->s_flags & SB_RDONLY))
6051		ext4_update_tstamp(es, s_wtime);
6052	es->s_kbytes_written =
6053		cpu_to_le64(sbi->s_kbytes_written +
6054		    ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
6055		      sbi->s_sectors_written_start) >> 1));
6056	if (percpu_counter_initialized(&sbi->s_freeclusters_counter))
6057		ext4_free_blocks_count_set(es,
6058			EXT4_C2B(sbi, percpu_counter_sum_positive(
6059				&sbi->s_freeclusters_counter)));
6060	if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
6061		es->s_free_inodes_count =
6062			cpu_to_le32(percpu_counter_sum_positive(
6063				&sbi->s_freeinodes_counter));
6064	/* Copy error information to the on-disk superblock */
6065	spin_lock(&sbi->s_error_lock);
6066	if (sbi->s_add_error_count > 0) {
6067		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
6068		if (!es->s_first_error_time && !es->s_first_error_time_hi) {
6069			__ext4_update_tstamp(&es->s_first_error_time,
6070					     &es->s_first_error_time_hi,
6071					     sbi->s_first_error_time);
6072			strncpy(es->s_first_error_func, sbi->s_first_error_func,
6073				sizeof(es->s_first_error_func));
6074			es->s_first_error_line =
6075				cpu_to_le32(sbi->s_first_error_line);
6076			es->s_first_error_ino =
6077				cpu_to_le32(sbi->s_first_error_ino);
6078			es->s_first_error_block =
6079				cpu_to_le64(sbi->s_first_error_block);
6080			es->s_first_error_errcode =
6081				ext4_errno_to_code(sbi->s_first_error_code);
6082		}
6083		__ext4_update_tstamp(&es->s_last_error_time,
6084				     &es->s_last_error_time_hi,
6085				     sbi->s_last_error_time);
6086		strncpy(es->s_last_error_func, sbi->s_last_error_func,
6087			sizeof(es->s_last_error_func));
6088		es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
6089		es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
6090		es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
6091		es->s_last_error_errcode =
6092				ext4_errno_to_code(sbi->s_last_error_code);
6093		/*
6094		 * Start the daily error reporting function if it hasn't been
6095		 * started already
6096		 */
6097		if (!es->s_error_count)
6098			mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
6099		le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
6100		sbi->s_add_error_count = 0;
6101	}
6102	spin_unlock(&sbi->s_error_lock);
6103
6104	ext4_superblock_csum_set(sb);
6105	unlock_buffer(sbh);
6106}
6107
6108static int ext4_commit_super(struct super_block *sb)
6109{
6110	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
6111
6112	if (!sbh)
6113		return -EINVAL;
6114	if (block_device_ejected(sb))
6115		return -ENODEV;
6116
6117	ext4_update_super(sb);
6118
6119	lock_buffer(sbh);
6120	/* Buffer got discarded which means block device got invalidated */
6121	if (!buffer_mapped(sbh)) {
6122		unlock_buffer(sbh);
6123		return -EIO;
6124	}
6125
6126	if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
6127		/*
6128		 * Oh, dear.  A previous attempt to write the
6129		 * superblock failed.  This could happen because the
6130		 * USB device was yanked out.  Or it could happen to
6131		 * be a transient write error and maybe the block will
6132		 * be remapped.  Nothing we can do but to retry the
6133		 * write and hope for the best.
6134		 */
6135		ext4_msg(sb, KERN_ERR, "previous I/O error to "
6136		       "superblock detected");
6137		clear_buffer_write_io_error(sbh);
6138		set_buffer_uptodate(sbh);
6139	}
6140	get_bh(sbh);
6141	/* Clear potential dirty bit if it was journalled update */
6142	clear_buffer_dirty(sbh);
6143	sbh->b_end_io = end_buffer_write_sync;
6144	submit_bh(REQ_OP_WRITE | REQ_SYNC |
6145		  (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
6146	wait_on_buffer(sbh);
6147	if (buffer_write_io_error(sbh)) {
6148		ext4_msg(sb, KERN_ERR, "I/O error while writing "
6149		       "superblock");
6150		clear_buffer_write_io_error(sbh);
6151		set_buffer_uptodate(sbh);
6152		return -EIO;
6153	}
6154	return 0;
6155}
6156
6157/*
6158 * Have we just finished recovery?  If so, and if we are mounting (or
6159 * remounting) the filesystem readonly, then we will end up with a
6160 * consistent fs on disk.  Record that fact.
6161 */
6162static int ext4_mark_recovery_complete(struct super_block *sb,
6163				       struct ext4_super_block *es)
6164{
6165	int err;
6166	journal_t *journal = EXT4_SB(sb)->s_journal;
6167
6168	if (!ext4_has_feature_journal(sb)) {
6169		if (journal != NULL) {
6170			ext4_error(sb, "Journal got removed while the fs was "
6171				   "mounted!");
6172			return -EFSCORRUPTED;
6173		}
6174		return 0;
6175	}
6176	jbd2_journal_lock_updates(journal);
6177	err = jbd2_journal_flush(journal, 0);
6178	if (err < 0)
6179		goto out;
6180
6181	if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) ||
6182	    ext4_has_feature_orphan_present(sb))) {
6183		if (!ext4_orphan_file_empty(sb)) {
6184			ext4_error(sb, "Orphan file not empty on read-only fs.");
6185			err = -EFSCORRUPTED;
6186			goto out;
6187		}
6188		ext4_clear_feature_journal_needs_recovery(sb);
6189		ext4_clear_feature_orphan_present(sb);
6190		ext4_commit_super(sb);
6191	}
6192out:
6193	jbd2_journal_unlock_updates(journal);
6194	return err;
6195}
6196
6197/*
6198 * If we are mounting (or read-write remounting) a filesystem whose journal
6199 * has recorded an error from a previous lifetime, move that error to the
6200 * main filesystem now.
6201 */
6202static int ext4_clear_journal_err(struct super_block *sb,
6203				   struct ext4_super_block *es)
6204{
6205	journal_t *journal;
6206	int j_errno;
6207	const char *errstr;
6208
6209	if (!ext4_has_feature_journal(sb)) {
6210		ext4_error(sb, "Journal got removed while the fs was mounted!");
6211		return -EFSCORRUPTED;
6212	}
6213
6214	journal = EXT4_SB(sb)->s_journal;
6215
6216	/*
6217	 * Now check for any error status which may have been recorded in the
6218	 * journal by a prior ext4_error() or ext4_abort()
6219	 */
6220
6221	j_errno = jbd2_journal_errno(journal);
6222	if (j_errno) {
6223		char nbuf[16];
6224
6225		errstr = ext4_decode_error(sb, j_errno, nbuf);
6226		ext4_warning(sb, "Filesystem error recorded "
6227			     "from previous mount: %s", errstr);
6228
6229		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
6230		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
6231		j_errno = ext4_commit_super(sb);
6232		if (j_errno)
6233			return j_errno;
6234		ext4_warning(sb, "Marked fs in need of filesystem check.");
6235
6236		jbd2_journal_clear_err(journal);
6237		jbd2_journal_update_sb_errno(journal);
6238	}
6239	return 0;
6240}
6241
6242/*
6243 * Force the running and committing transactions to commit,
6244 * and wait on the commit.
6245 */
6246int ext4_force_commit(struct super_block *sb)
6247{
6248	journal_t *journal;
6249
6250	if (sb_rdonly(sb))
6251		return 0;
6252
6253	journal = EXT4_SB(sb)->s_journal;
6254	return ext4_journal_force_commit(journal);
6255}
6256
6257static int ext4_sync_fs(struct super_block *sb, int wait)
6258{
6259	int ret = 0;
6260	tid_t target;
6261	bool needs_barrier = false;
6262	struct ext4_sb_info *sbi = EXT4_SB(sb);
6263
6264	if (unlikely(ext4_forced_shutdown(sbi)))
6265		return 0;
6266
6267	trace_ext4_sync_fs(sb, wait);
6268	flush_workqueue(sbi->rsv_conversion_wq);
6269	/*
6270	 * Writeback quota in non-journalled quota case - journalled quota has
6271	 * no dirty dquots
6272	 */
6273	dquot_writeback_dquots(sb, -1);
6274	/*
6275	 * Data writeback is possible w/o journal transaction, so barrier must
6276	 * being sent at the end of the function. But we can skip it if
6277	 * transaction_commit will do it for us.
6278	 */
6279	if (sbi->s_journal) {
6280		target = jbd2_get_latest_transaction(sbi->s_journal);
6281		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
6282		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
6283			needs_barrier = true;
6284
6285		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
6286			if (wait)
6287				ret = jbd2_log_wait_commit(sbi->s_journal,
6288							   target);
6289		}
6290	} else if (wait && test_opt(sb, BARRIER))
6291		needs_barrier = true;
6292	if (needs_barrier) {
6293		int err;
6294		err = blkdev_issue_flush(sb->s_bdev);
6295		if (!ret)
6296			ret = err;
6297	}
6298
6299	return ret;
6300}
6301
6302/*
6303 * LVM calls this function before a (read-only) snapshot is created.  This
6304 * gives us a chance to flush the journal completely and mark the fs clean.
6305 *
6306 * Note that only this function cannot bring a filesystem to be in a clean
6307 * state independently. It relies on upper layer to stop all data & metadata
6308 * modifications.
6309 */
6310static int ext4_freeze(struct super_block *sb)
6311{
6312	int error = 0;
6313	journal_t *journal;
6314
6315	if (sb_rdonly(sb))
6316		return 0;
6317
6318	journal = EXT4_SB(sb)->s_journal;
6319
6320	if (journal) {
6321		/* Now we set up the journal barrier. */
6322		jbd2_journal_lock_updates(journal);
6323
6324		/*
6325		 * Don't clear the needs_recovery flag if we failed to
6326		 * flush the journal.
6327		 */
6328		error = jbd2_journal_flush(journal, 0);
6329		if (error < 0)
6330			goto out;
6331
6332		/* Journal blocked and flushed, clear needs_recovery flag. */
6333		ext4_clear_feature_journal_needs_recovery(sb);
6334		if (ext4_orphan_file_empty(sb))
6335			ext4_clear_feature_orphan_present(sb);
6336	}
6337
6338	error = ext4_commit_super(sb);
6339out:
6340	if (journal)
6341		/* we rely on upper layer to stop further updates */
6342		jbd2_journal_unlock_updates(journal);
6343	return error;
6344}
6345
6346/*
6347 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
6348 * flag here, even though the filesystem is not technically dirty yet.
6349 */
6350static int ext4_unfreeze(struct super_block *sb)
6351{
6352	if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
6353		return 0;
6354
6355	if (EXT4_SB(sb)->s_journal) {
6356		/* Reset the needs_recovery flag before the fs is unlocked. */
6357		ext4_set_feature_journal_needs_recovery(sb);
6358		if (ext4_has_feature_orphan_file(sb))
6359			ext4_set_feature_orphan_present(sb);
6360	}
6361
6362	ext4_commit_super(sb);
6363	return 0;
6364}
6365
6366/*
6367 * Structure to save mount options for ext4_remount's benefit
6368 */
6369struct ext4_mount_options {
6370	unsigned long s_mount_opt;
6371	unsigned long s_mount_opt2;
6372	kuid_t s_resuid;
6373	kgid_t s_resgid;
6374	unsigned long s_commit_interval;
6375	u32 s_min_batch_time, s_max_batch_time;
6376#ifdef CONFIG_QUOTA
6377	int s_jquota_fmt;
6378	char *s_qf_names[EXT4_MAXQUOTAS];
6379#endif
6380};
6381
/*
 * Apply the mount options carried by @fc to the already mounted @sb.
 *
 * The current option values are first saved in old_opts.  After the new
 * options have been applied, every failure jumps to restore_opts, which
 * copies the saved values back into sb/sbi before returning the error.
 * A change of the read-only state requested through fc->sb_flags is
 * handled here as well (rw -> ro and ro -> rw).
 *
 * Returns 0 on success or a negative errno.
 */
static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
{
	struct ext4_fs_context *ctx = fc->fs_private;
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	unsigned long old_sb_flags;
	struct ext4_mount_options old_opts;
	ext4_group_t g;
	int err = 0;
#ifdef CONFIG_QUOTA
	int enable_quota = 0;
	int i, j;
	char *to_free[EXT4_MAXQUOTAS];
#endif


	/* Store the original options */
	old_sb_flags = sb->s_flags;
	old_opts.s_mount_opt = sbi->s_mount_opt;
	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
	old_opts.s_resuid = sbi->s_resuid;
	old_opts.s_resgid = sbi->s_resgid;
	old_opts.s_commit_interval = sbi->s_commit_interval;
	old_opts.s_min_batch_time = sbi->s_min_batch_time;
	old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
	/*
	 * Duplicate the quota file names so they can be put back on failure.
	 * If a duplication fails, the copies made so far are freed and the
	 * remount is refused before anything has been changed.
	 */
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		if (sbi->s_qf_names[i]) {
			char *qf_name = get_qf_name(sb, sbi, i);

			old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
			if (!old_opts.s_qf_names[i]) {
				for (j = 0; j < i; j++)
					kfree(old_opts.s_qf_names[j]);
				return -ENOMEM;
			}
		} else
			old_opts.s_qf_names[i] = NULL;
#endif
	/*
	 * EXT4_SPEC_JOURNAL_IOPRIO not set in ctx->spec: carry over the
	 * journal thread's current I/O priority, or fall back to the default
	 * when there is no journal or it has no io_context.
	 */
	if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) {
		if (sbi->s_journal && sbi->s_journal->j_task->io_context)
			ctx->journal_ioprio =
				sbi->s_journal->j_task->io_context->ioprio;
		else
			ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;

	}

	ext4_apply_options(fc, sb);

	/*
	 * journal_checksum may not change on remount: if the bit differs
	 * from the saved value, flip it back and carry on.
	 */
	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
	    test_opt(sb, JOURNAL_CHECKSUM)) {
		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
			 "during remount not supported; ignoring");
		sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
	}

	/* Reject option combinations that conflict with the data mode. */
	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "both data=journal and delalloc");
			err = -EINVAL;
			goto restore_opts;
		}
		if (test_opt(sb, DIOREAD_NOLOCK)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				 "both data=journal and dioread_nolock");
			err = -EINVAL;
			goto restore_opts;
		}
	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
			ext4_msg(sb, KERN_ERR, "can't mount with "
				"journal_async_commit in data=ordered mode");
			err = -EINVAL;
			goto restore_opts;
		}
	}

	/* Any change of the NO_MBCACHE bit, in either direction, is refused. */
	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
		ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
		err = -EINVAL;
		goto restore_opts;
	}

	if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
		ext4_abort(sb, ESHUTDOWN, "Abort forced by user");

	/* Make SB_POSIXACL in s_flags follow the POSIX_ACL mount option. */
	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

	es = sbi->s_es;

	if (sbi->s_journal) {
		ext4_init_journal_params(sb, sbi->s_journal);
		set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
	}

	/* Flush outstanding errors before changing fs state */
	flush_work(&sbi->s_error_work);

	/* Requested read-only state differs from the current one. */
	if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
		if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
			err = -EROFS;
			goto restore_opts;
		}

		if (fc->sb_flags & SB_RDONLY) {
			/* read-write -> read-only */
			err = sync_filesystem(sb);
			if (err < 0)
				goto restore_opts;
			err = dquot_suspend(sb, -1);
			if (err < 0)
				goto restore_opts;

			/*
			 * First of all, the unconditional stuff we have to do
			 * to disable replay of the journal when we next remount
			 */
			sb->s_flags |= SB_RDONLY;

			/*
			 * OK, test if we are remounting a valid rw partition
			 * readonly, and if so set the rdonly flag and then
			 * mark the partition as valid again.
			 */
			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
			    (sbi->s_mount_state & EXT4_VALID_FS))
				es->s_state = cpu_to_le16(sbi->s_mount_state);

			if (sbi->s_journal) {
				/*
				 * We let remount-ro finish even if marking fs
				 * as clean failed...
				 */
				ext4_mark_recovery_complete(sb, es);
			}
		} else {
			/* read-only -> read-write */
			/* Make sure we can mount this feature set readwrite */
			if (ext4_has_feature_readonly(sb) ||
			    !ext4_feature_set_ok(sb, 0)) {
				err = -EROFS;
				goto restore_opts;
			}
			/*
			 * Make sure the group descriptor checksums
			 * are sane.  If they aren't, refuse to remount r/w.
			 */
			for (g = 0; g < sbi->s_groups_count; g++) {
				struct ext4_group_desc *gdp =
					ext4_get_group_desc(sb, g, NULL);

				if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
					ext4_msg(sb, KERN_ERR,
	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
		g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
					       le16_to_cpu(gdp->bg_checksum));
					err = -EFSBADCRC;
					goto restore_opts;
				}
			}

			/*
			 * If we have an unprocessed orphan list hanging
			 * around from a previously readonly bdev mount,
			 * require a full umount/remount for now.
			 */
			if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
				ext4_msg(sb, KERN_WARNING, "Couldn't "
				       "remount RDWR because of unprocessed "
				       "orphan inode list.  Please "
				       "umount/remount instead");
				err = -EINVAL;
				goto restore_opts;
			}

			/*
			 * Mounting a RDONLY partition read-write, so reread
			 * and store the current valid flag.  (It may have
			 * been changed by e2fsck since we originally mounted
			 * the partition.)
			 */
			if (sbi->s_journal) {
				err = ext4_clear_journal_err(sb, es);
				if (err)
					goto restore_opts;
			}
			sbi->s_mount_state = (le16_to_cpu(es->s_state) &
					      ~EXT4_FC_REPLAY);

			err = ext4_setup_super(sb, es, 0);
			if (err)
				goto restore_opts;

			sb->s_flags &= ~SB_RDONLY;
			if (ext4_has_feature_mmp(sb)) {
				err = ext4_multi_mount_protect(sb,
						le64_to_cpu(es->s_mmp_block));
				if (err)
					goto restore_opts;
			}
#ifdef CONFIG_QUOTA
			/* Quota is turned back on further down. */
			enable_quota = 1;
#endif
		}
	}

	/*
	 * Handle creation of system zone data early because it can fail.
	 * Releasing of existing data is done when we are sure remount will
	 * succeed.
	 */
	if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
		err = ext4_setup_system_zone(sb);
		if (err)
			goto restore_opts;
	}

	/* No journal and the fs was read-write before: write the superblock. */
	if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
		err = ext4_commit_super(sb);
		if (err)
			goto restore_opts;
	}

#ifdef CONFIG_QUOTA
	if (enable_quota) {
		if (sb_any_quota_suspended(sb))
			dquot_resume(sb, -1);
		else if (ext4_has_feature_quota(sb)) {
			err = ext4_enable_quotas(sb);
			if (err)
				goto restore_opts;
		}
	}
	/* Release old quota file names */
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(old_opts.s_qf_names[i]);
#endif
	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
		ext4_release_system_zone(sb);

	/*
	 * Reinitialize lazy itable initialization thread based on
	 * current settings
	 */
	if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
		ext4_unregister_li_request(sb);
	else {
		ext4_group_t first_not_zeroed;
		first_not_zeroed = ext4_has_uninit_itable(sb);
		ext4_register_li_request(sb, first_not_zeroed);
	}

	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
		ext4_stop_mmpd(sbi);

	return 0;

restore_opts:
	/*
	 * If there was a failing r/w to ro transition, we may need to
	 * re-enable quota
	 */
	if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) &&
	    sb_any_quota_suspended(sb))
		dquot_resume(sb, -1);
	/* Put back everything that was saved at the top of the function. */
	sb->s_flags = old_sb_flags;
	sbi->s_mount_opt = old_opts.s_mount_opt;
	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
	sbi->s_resuid = old_opts.s_resuid;
	sbi->s_resgid = old_opts.s_resgid;
	sbi->s_commit_interval = old_opts.s_commit_interval;
	sbi->s_min_batch_time = old_opts.s_min_batch_time;
	sbi->s_max_batch_time = old_opts.s_max_batch_time;
	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
		ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
	/*
	 * Swap the saved quota file names back in.  The names being
	 * replaced are freed only after synchronize_rcu().
	 */
	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
		to_free[i] = get_qf_name(sb, sbi, i);
		rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
	}
	synchronize_rcu();
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(to_free[i]);
#endif
	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
		ext4_stop_mmpd(sbi);
	return err;
}
6673
6674static int ext4_reconfigure(struct fs_context *fc)
6675{
6676	struct super_block *sb = fc->root->d_sb;
6677	int ret;
6678
6679	fc->s_fs_info = EXT4_SB(sb);
6680
6681	ret = ext4_check_opt_consistency(fc, sb);
6682	if (ret < 0)
6683		return ret;
6684
6685	ret = __ext4_remount(fc, sb);
6686	if (ret < 0)
6687		return ret;
6688
6689	ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
6690		 &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w",
6691		 ext4_quota_mode(sb));
6692
6693	return 0;
6694}
6695
6696#ifdef CONFIG_QUOTA
6697static int ext4_statfs_project(struct super_block *sb,
6698			       kprojid_t projid, struct kstatfs *buf)
6699{
6700	struct kqid qid;
6701	struct dquot *dquot;
6702	u64 limit;
6703	u64 curblock;
6704
6705	qid = make_kqid_projid(projid);
6706	dquot = dqget(sb, qid);
6707	if (IS_ERR(dquot))
6708		return PTR_ERR(dquot);
6709	spin_lock(&dquot->dq_dqb_lock);
6710
6711	limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
6712			     dquot->dq_dqb.dqb_bhardlimit);
6713	limit >>= sb->s_blocksize_bits;
6714
6715	if (limit && buf->f_blocks > limit) {
6716		curblock = (dquot->dq_dqb.dqb_curspace +
6717			    dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
6718		buf->f_blocks = limit;
6719		buf->f_bfree = buf->f_bavail =
6720			(buf->f_blocks > curblock) ?
6721			 (buf->f_blocks - curblock) : 0;
6722	}
6723
6724	limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
6725			     dquot->dq_dqb.dqb_ihardlimit);
6726	if (limit && buf->f_files > limit) {
6727		buf->f_files = limit;
6728		buf->f_ffree =
6729			(buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
6730			 (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
6731	}
6732
6733	spin_unlock(&dquot->dq_dqb_lock);
6734	dqput(dquot);
6735	return 0;
6736}
6737#endif
6738
6739static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
6740{
6741	struct super_block *sb = dentry->d_sb;
6742	struct ext4_sb_info *sbi = EXT4_SB(sb);
6743	struct ext4_super_block *es = sbi->s_es;
6744	ext4_fsblk_t overhead = 0, resv_blocks;
6745	s64 bfree;
6746	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
6747
6748	if (!test_opt(sb, MINIX_DF))
6749		overhead = sbi->s_overhead;
6750
6751	buf->f_type = EXT4_SUPER_MAGIC;
6752	buf->f_bsize = sb->s_blocksize;
6753	buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
6754	bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
6755		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
6756	/* prevent underflow in case that few free space is available */
6757	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
6758	buf->f_bavail = buf->f_bfree -
6759			(ext4_r_blocks_count(es) + resv_blocks);
6760	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
6761		buf->f_bavail = 0;
6762	buf->f_files = le32_to_cpu(es->s_inodes_count);
6763	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
6764	buf->f_namelen = EXT4_NAME_LEN;
6765	buf->f_fsid = uuid_to_fsid(es->s_uuid);
6766
6767#ifdef CONFIG_QUOTA
6768	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
6769	    sb_has_quota_limits_enabled(sb, PRJQUOTA))
6770		ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
6771#endif
6772	return 0;
6773}
6774
6775
6776#ifdef CONFIG_QUOTA
6777
6778/*
6779 * Helper functions so that transaction is started before we acquire dqio_sem
6780 * to keep correct lock ordering of transaction > dqio_sem
6781 */
6782static inline struct inode *dquot_to_inode(struct dquot *dquot)
6783{
6784	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
6785}
6786
6787static int ext4_write_dquot(struct dquot *dquot)
6788{
6789	int ret, err;
6790	handle_t *handle;
6791	struct inode *inode;
6792
6793	inode = dquot_to_inode(dquot);
6794	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
6795				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
6796	if (IS_ERR(handle))
6797		return PTR_ERR(handle);
6798	ret = dquot_commit(dquot);
6799	err = ext4_journal_stop(handle);
6800	if (!ret)
6801		ret = err;
6802	return ret;
6803}
6804
6805static int ext4_acquire_dquot(struct dquot *dquot)
6806{
6807	int ret, err;
6808	handle_t *handle;
6809
6810	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
6811				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
6812	if (IS_ERR(handle))
6813		return PTR_ERR(handle);
6814	ret = dquot_acquire(dquot);
6815	err = ext4_journal_stop(handle);
6816	if (!ret)
6817		ret = err;
6818	return ret;
6819}
6820
6821static int ext4_release_dquot(struct dquot *dquot)
6822{
6823	int ret, err;
6824	handle_t *handle;
6825
6826	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
6827				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
6828	if (IS_ERR(handle)) {
6829		/* Release dquot anyway to avoid endless cycle in dqput() */
6830		dquot_release(dquot);
6831		return PTR_ERR(handle);
6832	}
6833	ret = dquot_release(dquot);
6834	err = ext4_journal_stop(handle);
6835	if (!ret)
6836		ret = err;
6837	return ret;
6838}
6839
6840static int ext4_mark_dquot_dirty(struct dquot *dquot)
6841{
6842	struct super_block *sb = dquot->dq_sb;
6843
6844	if (ext4_is_quota_journalled(sb)) {
6845		dquot_mark_dquot_dirty(dquot);
6846		return ext4_write_dquot(dquot);
6847	} else {
6848		return dquot_mark_dquot_dirty(dquot);
6849	}
6850}
6851
6852static int ext4_write_info(struct super_block *sb, int type)
6853{
6854	int ret, err;
6855	handle_t *handle;
6856
6857	/* Data block + inode block */
6858	handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
6859	if (IS_ERR(handle))
6860		return PTR_ERR(handle);
6861	ret = dquot_commit_info(sb, type);
6862	err = ext4_journal_stop(handle);
6863	if (!ret)
6864		ret = err;
6865	return ret;
6866}
6867
/* Set the lockdep subclass of @inode's i_data_sem to @subclass. */
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* The first argument of lockdep_set_subclass has to be
	 * *exactly* the same as the argument to init_rwsem() --- in
	 * this case, in init_once() --- or lockdep gets unhappy
	 * because the name of the lock is set using the
	 * stringification of the argument to init_rwsem().
	 * Do not rename "ei" or rewrite the expression below.
	 */
	(void) ei;	/* shut up clang warning if !CONFIG_LOCKDEP */
	lockdep_set_subclass(&ei->i_data_sem, subclass);
}
6881
6882/*
6883 * Standard function to be called on quota_on
6884 */
6885static int ext4_quota_on(struct super_block *sb, int type, int format_id,
6886			 const struct path *path)
6887{
6888	int err;
6889
6890	if (!test_opt(sb, QUOTA))
6891		return -EINVAL;
6892
6893	/* Quotafile not on the same filesystem? */
6894	if (path->dentry->d_sb != sb)
6895		return -EXDEV;
6896
6897	/* Quota already enabled for this file? */
6898	if (IS_NOQUOTA(d_inode(path->dentry)))
6899		return -EBUSY;
6900
6901	/* Journaling quota? */
6902	if (EXT4_SB(sb)->s_qf_names[type]) {
6903		/* Quotafile not in fs root? */
6904		if (path->dentry->d_parent != sb->s_root)
6905			ext4_msg(sb, KERN_WARNING,
6906				"Quota file not on filesystem root. "
6907				"Journaled quota will not work");
6908		sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
6909	} else {
6910		/*
6911		 * Clear the flag just in case mount options changed since
6912		 * last time.
6913		 */
6914		sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
6915	}
6916
6917	lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
6918	err = dquot_quota_on(sb, type, format_id, path);
6919	if (!err) {
6920		struct inode *inode = d_inode(path->dentry);
6921		handle_t *handle;
6922
6923		/*
6924		 * Set inode flags to prevent userspace from messing with quota
6925		 * files. If this fails, we return success anyway since quotas
6926		 * are already enabled and this is not a hard failure.
6927		 */
6928		inode_lock(inode);
6929		handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
6930		if (IS_ERR(handle))
6931			goto unlock_inode;
6932		EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
6933		inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
6934				S_NOATIME | S_IMMUTABLE);
6935		err = ext4_mark_inode_dirty(handle, inode);
6936		ext4_journal_stop(handle);
6937	unlock_inode:
6938		inode_unlock(inode);
6939		if (err)
6940			dquot_quota_off(sb, type);
6941	}
6942	if (err)
6943		lockdep_set_quota_inode(path->dentry->d_inode,
6944					     I_DATA_SEM_NORMAL);
6945	return err;
6946}
6947
6948static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
6949{
6950	switch (type) {
6951	case USRQUOTA:
6952		return qf_inum == EXT4_USR_QUOTA_INO;
6953	case GRPQUOTA:
6954		return qf_inum == EXT4_GRP_QUOTA_INO;
6955	case PRJQUOTA:
6956		return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;
6957	default:
6958		BUG();
6959	}
6960}
6961
6962static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
6963			     unsigned int flags)
6964{
6965	int err;
6966	struct inode *qf_inode;
6967	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
6968		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
6969		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
6970		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
6971	};
6972
6973	BUG_ON(!ext4_has_feature_quota(sb));
6974
6975	if (!qf_inums[type])
6976		return -EPERM;
6977
6978	if (!ext4_check_quota_inum(type, qf_inums[type])) {
6979		ext4_error(sb, "Bad quota inum: %lu, type: %d",
6980				qf_inums[type], type);
6981		return -EUCLEAN;
6982	}
6983
6984	qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
6985	if (IS_ERR(qf_inode)) {
6986		ext4_error(sb, "Bad quota inode: %lu, type: %d",
6987				qf_inums[type], type);
6988		return PTR_ERR(qf_inode);
6989	}
6990
6991	/* Don't account quota for quota files to avoid recursion */
6992	qf_inode->i_flags |= S_NOQUOTA;
6993	lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
6994	err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
6995	if (err)
6996		lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
6997	iput(qf_inode);
6998
6999	return err;
7000}
7001
7002/* Enable usage tracking for all quota types. */
7003int ext4_enable_quotas(struct super_block *sb)
7004{
7005	int type, err = 0;
7006	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
7007		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
7008		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
7009		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
7010	};
7011	bool quota_mopt[EXT4_MAXQUOTAS] = {
7012		test_opt(sb, USRQUOTA),
7013		test_opt(sb, GRPQUOTA),
7014		test_opt(sb, PRJQUOTA),
7015	};
7016
7017	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
7018	for (type = 0; type < EXT4_MAXQUOTAS; type++) {
7019		if (qf_inums[type]) {
7020			err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
7021				DQUOT_USAGE_ENABLED |
7022				(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
7023			if (err) {
7024				ext4_warning(sb,
7025					"Failed to enable quota tracking "
7026					"(type=%d, err=%d, ino=%lu). "
7027					"Please run e2fsck to fix.", type,
7028					err, qf_inums[type]);
7029				for (type--; type >= 0; type--) {
7030					struct inode *inode;
7031
7032					inode = sb_dqopt(sb)->files[type];
7033					if (inode)
7034						inode = igrab(inode);
7035					dquot_quota_off(sb, type);
7036					if (inode) {
7037						lockdep_set_quota_inode(inode,
7038							I_DATA_SEM_NORMAL);
7039						iput(inode);
7040					}
7041				}
7042
7043				return err;
7044			}
7045		}
7046	}
7047	return 0;
7048}
7049
7050static int ext4_quota_off(struct super_block *sb, int type)
7051{
7052	struct inode *inode = sb_dqopt(sb)->files[type];
7053	handle_t *handle;
7054	int err;
7055
7056	/* Force all delayed allocation blocks to be allocated.
7057	 * Caller already holds s_umount sem */
7058	if (test_opt(sb, DELALLOC))
7059		sync_filesystem(sb);
7060
7061	if (!inode || !igrab(inode))
7062		goto out;
7063
7064	err = dquot_quota_off(sb, type);
7065	if (err || ext4_has_feature_quota(sb))
7066		goto out_put;
7067
7068	inode_lock(inode);
7069	/*
7070	 * Update modification times of quota files when userspace can
7071	 * start looking at them. If we fail, we return success anyway since
7072	 * this is not a hard failure and quotas are already disabled.
7073	 */
7074	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
7075	if (IS_ERR(handle)) {
7076		err = PTR_ERR(handle);
7077		goto out_unlock;
7078	}
7079	EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
7080	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
7081	inode->i_mtime = inode->i_ctime = current_time(inode);
7082	err = ext4_mark_inode_dirty(handle, inode);
7083	ext4_journal_stop(handle);
7084out_unlock:
7085	inode_unlock(inode);
7086out_put:
7087	lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
7088	iput(inode);
7089	return err;
7090out:
7091	return dquot_quota_off(sb, type);
7092}
7093
7094/* Read data from quotafile - avoid pagecache and such because we cannot afford
7095 * acquiring the locks... As quota files are never truncated and quota code
7096 * itself serializes the operations (and no one else should touch the files)
7097 * we don't have to be afraid of races */
7098static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
7099			       size_t len, loff_t off)
7100{
7101	struct inode *inode = sb_dqopt(sb)->files[type];
7102	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
7103	int offset = off & (sb->s_blocksize - 1);
7104	int tocopy;
7105	size_t toread;
7106	struct buffer_head *bh;
7107	loff_t i_size = i_size_read(inode);
7108
7109	if (off > i_size)
7110		return 0;
7111	if (off+len > i_size)
7112		len = i_size-off;
7113	toread = len;
7114	while (toread > 0) {
7115		tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
7116		bh = ext4_bread(NULL, inode, blk, 0);
7117		if (IS_ERR(bh))
7118			return PTR_ERR(bh);
7119		if (!bh)	/* A hole? */
7120			memset(data, 0, tocopy);
7121		else
7122			memcpy(data, bh->b_data+offset, tocopy);
7123		brelse(bh);
7124		offset = 0;
7125		toread -= tocopy;
7126		data += tocopy;
7127		blk++;
7128	}
7129	return len;
7130}
7131
7132/* Write to quotafile (we know the transaction is already started and has
7133 * enough credits) */
7134static ssize_t ext4_quota_write(struct super_block *sb, int type,
7135				const char *data, size_t len, loff_t off)
7136{
7137	struct inode *inode = sb_dqopt(sb)->files[type];
7138	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
7139	int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
7140	int retries = 0;
7141	struct buffer_head *bh;
7142	handle_t *handle = journal_current_handle();
7143
7144	if (!handle) {
7145		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
7146			" cancelled because transaction is not started",
7147			(unsigned long long)off, (unsigned long long)len);
7148		return -EIO;
7149	}
7150	/*
7151	 * Since we account only one data block in transaction credits,
7152	 * then it is impossible to cross a block boundary.
7153	 */
7154	if (sb->s_blocksize - offset < len) {
7155		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
7156			" cancelled because not block aligned",
7157			(unsigned long long)off, (unsigned long long)len);
7158		return -EIO;
7159	}
7160
7161	do {
7162		bh = ext4_bread(handle, inode, blk,
7163				EXT4_GET_BLOCKS_CREATE |
7164				EXT4_GET_BLOCKS_METADATA_NOFAIL);
7165	} while (PTR_ERR(bh) == -ENOSPC &&
7166		 ext4_should_retry_alloc(inode->i_sb, &retries));
7167	if (IS_ERR(bh))
7168		return PTR_ERR(bh);
7169	if (!bh)
7170		goto out;
7171	BUFFER_TRACE(bh, "get write access");
7172	err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
7173	if (err) {
7174		brelse(bh);
7175		return err;
7176	}
7177	lock_buffer(bh);
7178	memcpy(bh->b_data+offset, data, len);
7179	flush_dcache_page(bh->b_page);
7180	unlock_buffer(bh);
7181	err = ext4_handle_dirty_metadata(handle, NULL, bh);
7182	brelse(bh);
7183out:
7184	if (inode->i_size < off + len) {
7185		i_size_write(inode, off + len);
7186		EXT4_I(inode)->i_disksize = inode->i_size;
7187		err2 = ext4_mark_inode_dirty(handle, inode);
7188		if (unlikely(err2 && !err))
7189			err = err2;
7190	}
7191	return err ? err : len;
7192}
7193#endif
7194
7195#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
7196static inline void register_as_ext2(void)
7197{
7198	int err = register_filesystem(&ext2_fs_type);
7199	if (err)
7200		printk(KERN_WARNING
7201		       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
7202}
7203
/* Unregister ext2_fs_type; the result of unregister_filesystem() is ignored. */
static inline void unregister_as_ext2(void)
{
	unregister_filesystem(&ext2_fs_type);
}
7208
/*
 * Return 1 if the feature set of @sb is acceptable for the ext2 type, 0
 * otherwise.  Unknown incompat features are never acceptable; unknown
 * ro_compat features are acceptable only when @sb is read-only.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext2_incompat_features(sb))
		return 0;

	return sb_rdonly(sb) || !ext4_has_unknown_ext2_ro_compat_features(sb);
}
7219#else
/*
 * Stubs for configurations where the preprocessor condition above does not
 * hold: registration does nothing and no feature set is ever accepted for
 * the ext2 type.
 */
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
7223#endif
7224
7225static inline void register_as_ext3(void)
7226{
7227	int err = register_filesystem(&ext3_fs_type);
7228	if (err)
7229		printk(KERN_WARNING
7230		       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
7231}
7232
/* Unregister ext3_fs_type; the result of unregister_filesystem() is ignored. */
static inline void unregister_as_ext3(void)
{
	unregister_filesystem(&ext3_fs_type);
}
7237
/*
 * Return 1 if the feature set of @sb is acceptable for the ext3 type, 0
 * otherwise.  Unknown incompat features or a missing journal feature are
 * never acceptable; unknown ro_compat features are acceptable only when
 * @sb is read-only.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext3_incompat_features(sb) ||
	    !ext4_has_feature_journal(sb))
		return 0;

	return sb_rdonly(sb) || !ext4_has_unknown_ext3_ro_compat_features(sb);
}
7250
/*
 * The filesystem type registered under the name "ext4" by ext4_init_fs()
 * and unregistered by ext4_exit_fs().  Mount contexts are initialised by
 * ext4_init_fs_context() with the parameters described by ext4_param_specs,
 * and superblocks are killed with kill_block_super().
 */
static struct file_system_type ext4_fs_type = {
	.owner			= THIS_MODULE,
	.name			= "ext4",
	.init_fs_context	= ext4_init_fs_context,
	.parameters		= ext4_param_specs,
	.kill_sb		= kill_block_super,
	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");
7260
/*
 * Shared across all ext4 file systems.  Each of the EXT4_WQ_HASH_SZ wait
 * queue heads is initialised once in ext4_init_fs().
 */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
7263
/*
 * Module initialisation.
 *
 * Sets up the mount message ratelimit and the shared ioend wait queues,
 * initialises the ext4 subsystems one after another, and finally registers
 * the ext3, ext2 and ext4 filesystem types.
 *
 * Returns 0 on success.  If any init step or the ext4 registration fails,
 * everything initialised so far is undone in reverse order through the
 * goto ladder at the end and the error is returned.
 */
static int __init ext4_init_fs(void)
{
	int i, err;

	ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
	ext4_li_info = NULL;

	/* Build-time check for flags consistency */
	ext4_check_flag_values();

	for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
		init_waitqueue_head(&ext4__ioend_wq[i]);

	/* First init step: nothing to unwind yet if it fails */
	err = ext4_init_es();
	if (err)
		return err;

	/*
	 * From here on each failure jumps to the label that undoes the
	 * steps which already succeeded.
	 */
	err = ext4_init_pending();
	if (err)
		goto out7;

	err = ext4_init_post_read_processing();
	if (err)
		goto out6;

	err = ext4_init_pageio();
	if (err)
		goto out5;

	err = ext4_init_system_zone();
	if (err)
		goto out4;

	err = ext4_init_sysfs();
	if (err)
		goto out3;

	err = ext4_init_mballoc();
	if (err)
		goto out2;
	err = init_inodecache();
	if (err)
		goto out1;

	err = ext4_fc_init_dentry_cache();
	if (err)
		goto out05;

	/*
	 * The ext3/ext2 registrations return nothing, so only a failure to
	 * register ext4 itself aborts initialisation.
	 */
	register_as_ext3();
	register_as_ext2();
	err = register_filesystem(&ext4_fs_type);
	if (err)
		goto out;

	return 0;
	/* Error unwind: labels run in reverse order of the init steps */
out:
	unregister_as_ext2();
	unregister_as_ext3();
	ext4_fc_destroy_dentry_cache();
out05:
	destroy_inodecache();
out1:
	ext4_exit_mballoc();
out2:
	ext4_exit_sysfs();
out3:
	ext4_exit_system_zone();
out4:
	ext4_exit_pageio();
out5:
	ext4_exit_post_read_processing();
out6:
	ext4_exit_pending();
out7:
	ext4_exit_es();

	return err;
}
7342
/*
 * Module exit.
 *
 * Calls ext4_destroy_lazyinit_thread(), unregisters the ext2, ext3 and
 * ext4 filesystem types, then tears down the subsystems that
 * ext4_init_fs() initialised.
 */
static void __exit ext4_exit_fs(void)
{
	ext4_destroy_lazyinit_thread();
	unregister_as_ext2();
	unregister_as_ext3();
	unregister_filesystem(&ext4_fs_type);
	ext4_fc_destroy_dentry_cache();
	destroy_inodecache();
	ext4_exit_mballoc();
	ext4_exit_sysfs();
	ext4_exit_system_zone();
	ext4_exit_pageio();
	ext4_exit_post_read_processing();
	/*
	 * NOTE(review): the error unwind in ext4_init_fs() calls
	 * ext4_exit_pending() before ext4_exit_es(); here the order is the
	 * other way round.  Presumably the two are independent - confirm.
	 */
	ext4_exit_es();
	ext4_exit_pending();
}
7359
7360MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
7361MODULE_DESCRIPTION("Fourth Extended Filesystem");
7362MODULE_LICENSE("GPL");
7363MODULE_SOFTDEP("pre: crc32c");
7364module_init(ext4_init_fs)
7365module_exit(ext4_exit_fs)