fs/ext4/super.c at v5.15-rc1

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / ext4 / super.c
at v5.15-rc1 6676 lines 193 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/ext4/super.c
   4 *
   5 * Copyright (C) 1992, 1993, 1994, 1995
   6 * Remy Card (card@masi.ibp.fr)
   7 * Laboratoire MASI - Institut Blaise Pascal
   8 * Universite Pierre et Marie Curie (Paris VI)
   9 *
  10 *  from
  11 *
  12 *  linux/fs/minix/inode.c
  13 *
  14 *  Copyright (C) 1991, 1992  Linus Torvalds
  15 *
  16 *  Big-endian to little-endian byte-swapping/bitmaps by
  17 *        David S. Miller (davem@caip.rutgers.edu), 1995
  18 */
  19
  20#include <linux/module.h>
  21#include <linux/string.h>
  22#include <linux/fs.h>
  23#include <linux/time.h>
  24#include <linux/vmalloc.h>
  25#include <linux/slab.h>
  26#include <linux/init.h>
  27#include <linux/blkdev.h>
  28#include <linux/backing-dev.h>
  29#include <linux/parser.h>
  30#include <linux/buffer_head.h>
  31#include <linux/exportfs.h>
  32#include <linux/vfs.h>
  33#include <linux/random.h>
  34#include <linux/mount.h>
  35#include <linux/namei.h>
  36#include <linux/quotaops.h>
  37#include <linux/seq_file.h>
  38#include <linux/ctype.h>
  39#include <linux/log2.h>
  40#include <linux/crc16.h>
  41#include <linux/dax.h>
  42#include <linux/cleancache.h>
  43#include <linux/uaccess.h>
  44#include <linux/iversion.h>
  45#include <linux/unicode.h>
  46#include <linux/part_stat.h>
  47#include <linux/kthread.h>
  48#include <linux/freezer.h>
  49
  50#include "ext4.h"
  51#include "ext4_extents.h"	/* Needed for trace points definition */
  52#include "ext4_jbd2.h"
  53#include "xattr.h"
  54#include "acl.h"
  55#include "mballoc.h"
  56#include "fsmap.h"
  57
  58#define CREATE_TRACE_POINTS
  59#include <trace/events/ext4.h>
  60
  61static struct ext4_lazy_init *ext4_li_info;
  62static DEFINE_MUTEX(ext4_li_mtx);
  63static struct ratelimit_state ext4_mount_msg_ratelimit;
  64
  65static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
  66			     unsigned long journal_devnum);
  67static int ext4_show_options(struct seq_file *seq, struct dentry *root);
  68static void ext4_update_super(struct super_block *sb);
  69static int ext4_commit_super(struct super_block *sb);
  70static int ext4_mark_recovery_complete(struct super_block *sb,
  71					struct ext4_super_block *es);
  72static int ext4_clear_journal_err(struct super_block *sb,
  73				  struct ext4_super_block *es);
  74static int ext4_sync_fs(struct super_block *sb, int wait);
  75static int ext4_remount(struct super_block *sb, int *flags, char *data);
  76static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
  77static int ext4_unfreeze(struct super_block *sb);
  78static int ext4_freeze(struct super_block *sb);
  79static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
  80		       const char *dev_name, void *data);
  81static inline int ext2_feature_set_ok(struct super_block *sb);
  82static inline int ext3_feature_set_ok(struct super_block *sb);
  83static void ext4_destroy_lazyinit_thread(void);
  84static void ext4_unregister_li_request(struct super_block *sb);
  85static void ext4_clear_request_list(void);
  86static struct inode *ext4_get_journal_inode(struct super_block *sb,
  87					    unsigned int journal_inum);
  88
  89/*
  90 * Lock ordering
  91 *
  92 * page fault path:
  93 * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
  94 *   -> page lock -> i_data_sem (rw)
  95 *
  96 * buffered write path:
  97 * sb_start_write -> i_mutex -> mmap_lock
  98 * sb_start_write -> i_mutex -> transaction start -> page lock ->
  99 *   i_data_sem (rw)
 100 *
 101 * truncate:
 102 * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
 103 *   page lock
 104 * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
 105 *   i_data_sem (rw)
 106 *
 107 * direct IO:
 108 * sb_start_write -> i_mutex -> mmap_lock
 109 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
 110 *
 111 * writepages:
 112 * transaction start -> page lock(s) -> i_data_sem (rw)
 113 */
 114
 115#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
 116static struct file_system_type ext2_fs_type = {
 117	.owner		= THIS_MODULE,
 118	.name		= "ext2",
 119	.mount		= ext4_mount,
 120	.kill_sb	= kill_block_super,
 121	.fs_flags	= FS_REQUIRES_DEV,
 122};
 123MODULE_ALIAS_FS("ext2");
 124MODULE_ALIAS("ext2");
 125#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
 126#else
 127#define IS_EXT2_SB(sb) (0)
 128#endif
 129
 130
 131static struct file_system_type ext3_fs_type = {
 132	.owner		= THIS_MODULE,
 133	.name		= "ext3",
 134	.mount		= ext4_mount,
 135	.kill_sb	= kill_block_super,
 136	.fs_flags	= FS_REQUIRES_DEV,
 137};
 138MODULE_ALIAS_FS("ext3");
 139MODULE_ALIAS("ext3");
 140#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
 141
 142
 143static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags,
 144				  bh_end_io_t *end_io)
 145{
 146	/*
 147	 * buffer's verified bit is no longer valid after reading from
 148	 * disk again due to write out error, clear it to make sure we
 149	 * recheck the buffer contents.
 150	 */
 151	clear_buffer_verified(bh);
 152
 153	bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
 154	get_bh(bh);
 155	submit_bh(REQ_OP_READ, op_flags, bh);
 156}
 157
 158void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
 159			 bh_end_io_t *end_io)
 160{
 161	BUG_ON(!buffer_locked(bh));
 162
 163	if (ext4_buffer_uptodate(bh)) {
 164		unlock_buffer(bh);
 165		return;
 166	}
 167	__ext4_read_bh(bh, op_flags, end_io);
 168}
 169
 170int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io)
 171{
 172	BUG_ON(!buffer_locked(bh));
 173
 174	if (ext4_buffer_uptodate(bh)) {
 175		unlock_buffer(bh);
 176		return 0;
 177	}
 178
 179	__ext4_read_bh(bh, op_flags, end_io);
 180
 181	wait_on_buffer(bh);
 182	if (buffer_uptodate(bh))
 183		return 0;
 184	return -EIO;
 185}
 186
 187int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
 188{
 189	if (trylock_buffer(bh)) {
 190		if (wait)
 191			return ext4_read_bh(bh, op_flags, NULL);
 192		ext4_read_bh_nowait(bh, op_flags, NULL);
 193		return 0;
 194	}
 195	if (wait) {
 196		wait_on_buffer(bh);
 197		if (buffer_uptodate(bh))
 198			return 0;
 199		return -EIO;
 200	}
 201	return 0;
 202}
 203
 204/*
 205 * This works like __bread_gfp() except it uses ERR_PTR for error
 206 * returns.  Currently with sb_bread it's impossible to distinguish
 207 * between ENOMEM and EIO situations (since both result in a NULL
 208 * return.
 209 */
 210static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
 211					       sector_t block, int op_flags,
 212					       gfp_t gfp)
 213{
 214	struct buffer_head *bh;
 215	int ret;
 216
 217	bh = sb_getblk_gfp(sb, block, gfp);
 218	if (bh == NULL)
 219		return ERR_PTR(-ENOMEM);
 220	if (ext4_buffer_uptodate(bh))
 221		return bh;
 222
 223	ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
 224	if (ret) {
 225		put_bh(bh);
 226		return ERR_PTR(ret);
 227	}
 228	return bh;
 229}
 230
 231struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
 232				   int op_flags)
 233{
 234	return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
 235}
 236
 237struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
 238					    sector_t block)
 239{
 240	return __ext4_sb_bread_gfp(sb, block, 0, 0);
 241}
 242
 243void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
 244{
 245	struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
 246
 247	if (likely(bh)) {
 248		ext4_read_bh_lock(bh, REQ_RAHEAD, false);
 249		brelse(bh);
 250	}
 251}
 252
 253static int ext4_verify_csum_type(struct super_block *sb,
 254				 struct ext4_super_block *es)
 255{
 256	if (!ext4_has_feature_metadata_csum(sb))
 257		return 1;
 258
 259	return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
 260}
 261
 262static __le32 ext4_superblock_csum(struct super_block *sb,
 263				   struct ext4_super_block *es)
 264{
 265	struct ext4_sb_info *sbi = EXT4_SB(sb);
 266	int offset = offsetof(struct ext4_super_block, s_checksum);
 267	__u32 csum;
 268
 269	csum = ext4_chksum(sbi, ~0, (char *)es, offset);
 270
 271	return cpu_to_le32(csum);
 272}
 273
 274static int ext4_superblock_csum_verify(struct super_block *sb,
 275				       struct ext4_super_block *es)
 276{
 277	if (!ext4_has_metadata_csum(sb))
 278		return 1;
 279
 280	return es->s_checksum == ext4_superblock_csum(sb, es);
 281}
 282
 283void ext4_superblock_csum_set(struct super_block *sb)
 284{
 285	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 286
 287	if (!ext4_has_metadata_csum(sb))
 288		return;
 289
 290	es->s_checksum = ext4_superblock_csum(sb, es);
 291}
 292
 293ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 294			       struct ext4_group_desc *bg)
 295{
 296	return le32_to_cpu(bg->bg_block_bitmap_lo) |
 297		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 298		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 299}
 300
 301ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 302			       struct ext4_group_desc *bg)
 303{
 304	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 305		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 306		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 307}
 308
 309ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 310			      struct ext4_group_desc *bg)
 311{
 312	return le32_to_cpu(bg->bg_inode_table_lo) |
 313		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 314		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 315}
 316
 317__u32 ext4_free_group_clusters(struct super_block *sb,
 318			       struct ext4_group_desc *bg)
 319{
 320	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 321		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 322		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 323}
 324
 325__u32 ext4_free_inodes_count(struct super_block *sb,
 326			      struct ext4_group_desc *bg)
 327{
 328	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 329		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 330		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 331}
 332
 333__u32 ext4_used_dirs_count(struct super_block *sb,
 334			      struct ext4_group_desc *bg)
 335{
 336	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 337		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 338		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 339}
 340
 341__u32 ext4_itable_unused_count(struct super_block *sb,
 342			      struct ext4_group_desc *bg)
 343{
 344	return le16_to_cpu(bg->bg_itable_unused_lo) |
 345		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 346		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 347}
 348
 349void ext4_block_bitmap_set(struct super_block *sb,
 350			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 351{
 352	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
 353	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 354		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
 355}
 356
 357void ext4_inode_bitmap_set(struct super_block *sb,
 358			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 359{
 360	bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
 361	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 362		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
 363}
 364
 365void ext4_inode_table_set(struct super_block *sb,
 366			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
 367{
 368	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
 369	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 370		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 371}
 372
 373void ext4_free_group_clusters_set(struct super_block *sb,
 374				  struct ext4_group_desc *bg, __u32 count)
 375{
 376	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
 377	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 378		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
 379}
 380
 381void ext4_free_inodes_set(struct super_block *sb,
 382			  struct ext4_group_desc *bg, __u32 count)
 383{
 384	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
 385	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 386		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
 387}
 388
 389void ext4_used_dirs_set(struct super_block *sb,
 390			  struct ext4_group_desc *bg, __u32 count)
 391{
 392	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
 393	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 394		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
 395}
 396
 397void ext4_itable_unused_set(struct super_block *sb,
 398			  struct ext4_group_desc *bg, __u32 count)
 399{
 400	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
 401	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 402		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 403}
 404
 405static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
 406{
 407	now = clamp_val(now, 0, (1ull << 40) - 1);
 408
 409	*lo = cpu_to_le32(lower_32_bits(now));
 410	*hi = upper_32_bits(now);
 411}
 412
 413static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
 414{
 415	return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
 416}
 417#define ext4_update_tstamp(es, tstamp) \
 418	__ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
 419			     ktime_get_real_seconds())
 420#define ext4_get_tstamp(es, tstamp) \
 421	__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 422
 423/*
 424 * The del_gendisk() function uninitializes the disk-specific data
 425 * structures, including the bdi structure, without telling anyone
 426 * else.  Once this happens, any attempt to call mark_buffer_dirty()
 427 * (for example, by ext4_commit_super), will cause a kernel OOPS.
 428 * This is a kludge to prevent these oops until we can put in a proper
 429 * hook in del_gendisk() to inform the VFS and file system layers.
 430 */
 431static int block_device_ejected(struct super_block *sb)
 432{
 433	struct inode *bd_inode = sb->s_bdev->bd_inode;
 434	struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
 435
 436	return bdi->dev == NULL;
 437}
 438
 439static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 440{
 441	struct super_block		*sb = journal->j_private;
 442	struct ext4_sb_info		*sbi = EXT4_SB(sb);
 443	int				error = is_journal_aborted(journal);
 444	struct ext4_journal_cb_entry	*jce;
 445
 446	BUG_ON(txn->t_state == T_FINISHED);
 447
 448	ext4_process_freed_data(sb, txn->t_tid);
 449
 450	spin_lock(&sbi->s_md_lock);
 451	while (!list_empty(&txn->t_private_list)) {
 452		jce = list_entry(txn->t_private_list.next,
 453				 struct ext4_journal_cb_entry, jce_list);
 454		list_del_init(&jce->jce_list);
 455		spin_unlock(&sbi->s_md_lock);
 456		jce->jce_func(sb, jce, error);
 457		spin_lock(&sbi->s_md_lock);
 458	}
 459	spin_unlock(&sbi->s_md_lock);
 460}
 461
 462/*
 463 * This writepage callback for write_cache_pages()
 464 * takes care of a few cases after page cleaning.
 465 *
 466 * write_cache_pages() already checks for dirty pages
 467 * and calls clear_page_dirty_for_io(), which we want,
 468 * to write protect the pages.
 469 *
 470 * However, we may have to redirty a page (see below.)
 471 */
 472static int ext4_journalled_writepage_callback(struct page *page,
 473					      struct writeback_control *wbc,
 474					      void *data)
 475{
 476	transaction_t *transaction = (transaction_t *) data;
 477	struct buffer_head *bh, *head;
 478	struct journal_head *jh;
 479
 480	bh = head = page_buffers(page);
 481	do {
 482		/*
 483		 * We have to redirty a page in these cases:
 484		 * 1) If buffer is dirty, it means the page was dirty because it
 485		 * contains a buffer that needs checkpointing. So the dirty bit
 486		 * needs to be preserved so that checkpointing writes the buffer
 487		 * properly.
 488		 * 2) If buffer is not part of the committing transaction
 489		 * (we may have just accidentally come across this buffer because
 490		 * inode range tracking is not exact) or if the currently running
 491		 * transaction already contains this buffer as well, dirty bit
 492		 * needs to be preserved so that the buffer gets writeprotected
 493		 * properly on running transaction's commit.
 494		 */
 495		jh = bh2jh(bh);
 496		if (buffer_dirty(bh) ||
 497		    (jh && (jh->b_transaction != transaction ||
 498			    jh->b_next_transaction))) {
 499			redirty_page_for_writepage(wbc, page);
 500			goto out;
 501		}
 502	} while ((bh = bh->b_this_page) != head);
 503
 504out:
 505	return AOP_WRITEPAGE_ACTIVATE;
 506}
 507
 508static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
 509{
 510	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
 511	struct writeback_control wbc = {
 512		.sync_mode =  WB_SYNC_ALL,
 513		.nr_to_write = LONG_MAX,
 514		.range_start = jinode->i_dirty_start,
 515		.range_end = jinode->i_dirty_end,
 516        };
 517
 518	return write_cache_pages(mapping, &wbc,
 519				 ext4_journalled_writepage_callback,
 520				 jinode->i_transaction);
 521}
 522
 523static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
 524{
 525	int ret;
 526
 527	if (ext4_should_journal_data(jinode->i_vfs_inode))
 528		ret = ext4_journalled_submit_inode_data_buffers(jinode);
 529	else
 530		ret = jbd2_journal_submit_inode_data_buffers(jinode);
 531
 532	return ret;
 533}
 534
 535static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
 536{
 537	int ret = 0;
 538
 539	if (!ext4_should_journal_data(jinode->i_vfs_inode))
 540		ret = jbd2_journal_finish_inode_data_buffers(jinode);
 541
 542	return ret;
 543}
 544
 545static bool system_going_down(void)
 546{
 547	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
 548		|| system_state == SYSTEM_RESTART;
 549}
 550
 551struct ext4_err_translation {
 552	int code;
 553	int errno;
 554};
 555
 556#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }
 557
 558static struct ext4_err_translation err_translation[] = {
 559	EXT4_ERR_TRANSLATE(EIO),
 560	EXT4_ERR_TRANSLATE(ENOMEM),
 561	EXT4_ERR_TRANSLATE(EFSBADCRC),
 562	EXT4_ERR_TRANSLATE(EFSCORRUPTED),
 563	EXT4_ERR_TRANSLATE(ENOSPC),
 564	EXT4_ERR_TRANSLATE(ENOKEY),
 565	EXT4_ERR_TRANSLATE(EROFS),
 566	EXT4_ERR_TRANSLATE(EFBIG),
 567	EXT4_ERR_TRANSLATE(EEXIST),
 568	EXT4_ERR_TRANSLATE(ERANGE),
 569	EXT4_ERR_TRANSLATE(EOVERFLOW),
 570	EXT4_ERR_TRANSLATE(EBUSY),
 571	EXT4_ERR_TRANSLATE(ENOTDIR),
 572	EXT4_ERR_TRANSLATE(ENOTEMPTY),
 573	EXT4_ERR_TRANSLATE(ESHUTDOWN),
 574	EXT4_ERR_TRANSLATE(EFAULT),
 575};
 576
 577static int ext4_errno_to_code(int errno)
 578{
 579	int i;
 580
 581	for (i = 0; i < ARRAY_SIZE(err_translation); i++)
 582		if (err_translation[i].errno == errno)
 583			return err_translation[i].code;
 584	return EXT4_ERR_UNKNOWN;
 585}
 586
 587static void save_error_info(struct super_block *sb, int error,
 588			    __u32 ino, __u64 block,
 589			    const char *func, unsigned int line)
 590{
 591	struct ext4_sb_info *sbi = EXT4_SB(sb);
 592
 593	/* We default to EFSCORRUPTED error... */
 594	if (error == 0)
 595		error = EFSCORRUPTED;
 596
 597	spin_lock(&sbi->s_error_lock);
 598	sbi->s_add_error_count++;
 599	sbi->s_last_error_code = error;
 600	sbi->s_last_error_line = line;
 601	sbi->s_last_error_ino = ino;
 602	sbi->s_last_error_block = block;
 603	sbi->s_last_error_func = func;
 604	sbi->s_last_error_time = ktime_get_real_seconds();
 605	if (!sbi->s_first_error_time) {
 606		sbi->s_first_error_code = error;
 607		sbi->s_first_error_line = line;
 608		sbi->s_first_error_ino = ino;
 609		sbi->s_first_error_block = block;
 610		sbi->s_first_error_func = func;
 611		sbi->s_first_error_time = sbi->s_last_error_time;
 612	}
 613	spin_unlock(&sbi->s_error_lock);
 614}
 615
 616/* Deal with the reporting of failure conditions on a filesystem such as
 617 * inconsistencies detected or read IO failures.
 618 *
 619 * On ext2, we can store the error state of the filesystem in the
 620 * superblock.  That is not possible on ext4, because we may have other
 621 * write ordering constraints on the superblock which prevent us from
 622 * writing it out straight away; and given that the journal is about to
 623 * be aborted, we can't rely on the current, or future, transactions to
 624 * write out the superblock safely.
 625 *
 626 * We'll just use the jbd2_journal_abort() error code to record an error in
 627 * the journal instead.  On recovery, the journal will complain about
 628 * that error until we've noted it down and cleared it.
 629 *
 630 * If force_ro is set, we unconditionally force the filesystem into an
 631 * ABORT|READONLY state, unless the error response on the fs has been set to
 632 * panic in which case we take the easy way out and panic immediately. This is
 633 * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
 634 * at a critical moment in log management.
 635 */
 636static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
 637			      __u32 ino, __u64 block,
 638			      const char *func, unsigned int line)
 639{
 640	journal_t *journal = EXT4_SB(sb)->s_journal;
 641	bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);
 642
 643	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 644	if (test_opt(sb, WARN_ON_ERROR))
 645		WARN_ON_ONCE(1);
 646
 647	if (!continue_fs && !sb_rdonly(sb)) {
 648		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
 649		if (journal)
 650			jbd2_journal_abort(journal, -EIO);
 651	}
 652
 653	if (!bdev_read_only(sb->s_bdev)) {
 654		save_error_info(sb, error, ino, block, func, line);
 655		/*
 656		 * In case the fs should keep running, we need to writeout
 657		 * superblock through the journal. Due to lock ordering
 658		 * constraints, it may not be safe to do it right here so we
 659		 * defer superblock flushing to a workqueue.
 660		 */
 661		if (continue_fs)
 662			schedule_work(&EXT4_SB(sb)->s_error_work);
 663		else
 664			ext4_commit_super(sb);
 665	}
 666
 667	/*
 668	 * We force ERRORS_RO behavior when system is rebooting. Otherwise we
 669	 * could panic during 'reboot -f' as the underlying device got already
 670	 * disabled.
 671	 */
 672	if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
 673		panic("EXT4-fs (device %s): panic forced after error\n",
 674			sb->s_id);
 675	}
 676
 677	if (sb_rdonly(sb) || continue_fs)
 678		return;
 679
 680	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 681	/*
 682	 * Make sure updated value of ->s_mount_flags will be visible before
 683	 * ->s_flags update
 684	 */
 685	smp_wmb();
 686	sb->s_flags |= SB_RDONLY;
 687}
 688
 689static void flush_stashed_error_work(struct work_struct *work)
 690{
 691	struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
 692						s_error_work);
 693	journal_t *journal = sbi->s_journal;
 694	handle_t *handle;
 695
 696	/*
 697	 * If the journal is still running, we have to write out superblock
 698	 * through the journal to avoid collisions of other journalled sb
 699	 * updates.
 700	 *
 701	 * We use directly jbd2 functions here to avoid recursing back into
 702	 * ext4 error handling code during handling of previous errors.
 703	 */
 704	if (!sb_rdonly(sbi->s_sb) && journal) {
 705		struct buffer_head *sbh = sbi->s_sbh;
 706		handle = jbd2_journal_start(journal, 1);
 707		if (IS_ERR(handle))
 708			goto write_directly;
 709		if (jbd2_journal_get_write_access(handle, sbh)) {
 710			jbd2_journal_stop(handle);
 711			goto write_directly;
 712		}
 713		ext4_update_super(sbi->s_sb);
 714		if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
 715			ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
 716				 "superblock detected");
 717			clear_buffer_write_io_error(sbh);
 718			set_buffer_uptodate(sbh);
 719		}
 720
 721		if (jbd2_journal_dirty_metadata(handle, sbh)) {
 722			jbd2_journal_stop(handle);
 723			goto write_directly;
 724		}
 725		jbd2_journal_stop(handle);
 726		ext4_notify_error_sysfs(sbi);
 727		return;
 728	}
 729write_directly:
 730	/*
 731	 * Write through journal failed. Write sb directly to get error info
 732	 * out and hope for the best.
 733	 */
 734	ext4_commit_super(sbi->s_sb);
 735	ext4_notify_error_sysfs(sbi);
 736}
 737
 738#define ext4_error_ratelimit(sb)					\
 739		___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),	\
 740			     "EXT4-fs error")
 741
 742void __ext4_error(struct super_block *sb, const char *function,
 743		  unsigned int line, bool force_ro, int error, __u64 block,
 744		  const char *fmt, ...)
 745{
 746	struct va_format vaf;
 747	va_list args;
 748
 749	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 750		return;
 751
 752	trace_ext4_error(sb, function, line);
 753	if (ext4_error_ratelimit(sb)) {
 754		va_start(args, fmt);
 755		vaf.fmt = fmt;
 756		vaf.va = &args;
 757		printk(KERN_CRIT
 758		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
 759		       sb->s_id, function, line, current->comm, &vaf);
 760		va_end(args);
 761	}
 762	ext4_handle_error(sb, force_ro, error, 0, block, function, line);
 763}
 764
 765void __ext4_error_inode(struct inode *inode, const char *function,
 766			unsigned int line, ext4_fsblk_t block, int error,
 767			const char *fmt, ...)
 768{
 769	va_list args;
 770	struct va_format vaf;
 771
 772	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 773		return;
 774
 775	trace_ext4_error(inode->i_sb, function, line);
 776	if (ext4_error_ratelimit(inode->i_sb)) {
 777		va_start(args, fmt);
 778		vaf.fmt = fmt;
 779		vaf.va = &args;
 780		if (block)
 781			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 782			       "inode #%lu: block %llu: comm %s: %pV\n",
 783			       inode->i_sb->s_id, function, line, inode->i_ino,
 784			       block, current->comm, &vaf);
 785		else
 786			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 787			       "inode #%lu: comm %s: %pV\n",
 788			       inode->i_sb->s_id, function, line, inode->i_ino,
 789			       current->comm, &vaf);
 790		va_end(args);
 791	}
 792	ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
 793			  function, line);
 794}
 795
 796void __ext4_error_file(struct file *file, const char *function,
 797		       unsigned int line, ext4_fsblk_t block,
 798		       const char *fmt, ...)
 799{
 800	va_list args;
 801	struct va_format vaf;
 802	struct inode *inode = file_inode(file);
 803	char pathname[80], *path;
 804
 805	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 806		return;
 807
 808	trace_ext4_error(inode->i_sb, function, line);
 809	if (ext4_error_ratelimit(inode->i_sb)) {
 810		path = file_path(file, pathname, sizeof(pathname));
 811		if (IS_ERR(path))
 812			path = "(unknown)";
 813		va_start(args, fmt);
 814		vaf.fmt = fmt;
 815		vaf.va = &args;
 816		if (block)
 817			printk(KERN_CRIT
 818			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 819			       "block %llu: comm %s: path %s: %pV\n",
 820			       inode->i_sb->s_id, function, line, inode->i_ino,
 821			       block, current->comm, path, &vaf);
 822		else
 823			printk(KERN_CRIT
 824			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 825			       "comm %s: path %s: %pV\n",
 826			       inode->i_sb->s_id, function, line, inode->i_ino,
 827			       current->comm, path, &vaf);
 828		va_end(args);
 829	}
 830	ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
 831			  function, line);
 832}
 833
 834const char *ext4_decode_error(struct super_block *sb, int errno,
 835			      char nbuf[16])
 836{
 837	char *errstr = NULL;
 838
 839	switch (errno) {
 840	case -EFSCORRUPTED:
 841		errstr = "Corrupt filesystem";
 842		break;
 843	case -EFSBADCRC:
 844		errstr = "Filesystem failed CRC";
 845		break;
 846	case -EIO:
 847		errstr = "IO failure";
 848		break;
 849	case -ENOMEM:
 850		errstr = "Out of memory";
 851		break;
 852	case -EROFS:
 853		if (!sb || (EXT4_SB(sb)->s_journal &&
 854			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 855			errstr = "Journal has aborted";
 856		else
 857			errstr = "Readonly filesystem";
 858		break;
 859	default:
 860		/* If the caller passed in an extra buffer for unknown
 861		 * errors, textualise them now.  Else we just return
 862		 * NULL. */
 863		if (nbuf) {
 864			/* Check for truncated error codes... */
 865			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
 866				errstr = nbuf;
 867		}
 868		break;
 869	}
 870
 871	return errstr;
 872}
 873
 874/* __ext4_std_error decodes expected errors from journaling functions
 875 * automatically and invokes the appropriate error response.  */
 876
 877void __ext4_std_error(struct super_block *sb, const char *function,
 878		      unsigned int line, int errno)
 879{
 880	char nbuf[16];
 881	const char *errstr;
 882
 883	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 884		return;
 885
 886	/* Special case: if the error is EROFS, and we're not already
 887	 * inside a transaction, then there's really no point in logging
 888	 * an error. */
 889	if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
 890		return;
 891
 892	if (ext4_error_ratelimit(sb)) {
 893		errstr = ext4_decode_error(sb, errno, nbuf);
 894		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
 895		       sb->s_id, function, line, errstr);
 896	}
 897
 898	ext4_handle_error(sb, false, -errno, 0, 0, function, line);
 899}
 900
 901void __ext4_msg(struct super_block *sb,
 902		const char *prefix, const char *fmt, ...)
 903{
 904	struct va_format vaf;
 905	va_list args;
 906
 907	atomic_inc(&EXT4_SB(sb)->s_msg_count);
 908	if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
 909		return;
 910
 911	va_start(args, fmt);
 912	vaf.fmt = fmt;
 913	vaf.va = &args;
 914	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
 915	va_end(args);
 916}
 917
 918static int ext4_warning_ratelimit(struct super_block *sb)
 919{
 920	atomic_inc(&EXT4_SB(sb)->s_warning_count);
 921	return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
 922			    "EXT4-fs warning");
 923}
 924
 925void __ext4_warning(struct super_block *sb, const char *function,
 926		    unsigned int line, const char *fmt, ...)
 927{
 928	struct va_format vaf;
 929	va_list args;
 930
 931	if (!ext4_warning_ratelimit(sb))
 932		return;
 933
 934	va_start(args, fmt);
 935	vaf.fmt = fmt;
 936	vaf.va = &args;
 937	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
 938	       sb->s_id, function, line, &vaf);
 939	va_end(args);
 940}
 941
 942void __ext4_warning_inode(const struct inode *inode, const char *function,
 943			  unsigned int line, const char *fmt, ...)
 944{
 945	struct va_format vaf;
 946	va_list args;
 947
 948	if (!ext4_warning_ratelimit(inode->i_sb))
 949		return;
 950
 951	va_start(args, fmt);
 952	vaf.fmt = fmt;
 953	vaf.va = &args;
 954	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
 955	       "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
 956	       function, line, inode->i_ino, current->comm, &vaf);
 957	va_end(args);
 958}
 959
/*
 * Report an error associated with block group @grp, and optionally with
 * inode @ino and block @block (either may be 0 to omit it from the
 * message).
 *
 * The function temporarily drops and retakes the group lock around
 * ext4_handle_error() (see the __releases/__acquires annotations), so it
 * is presumably meant to be called with that lock held -- confirm
 * against callers.
 *
 * Nothing is done if the filesystem has been forcibly shut down.
 */
void __ext4_grp_locked_error(const char *function, unsigned int line,
			     struct super_block *sb, ext4_group_t grp,
			     unsigned long ino, ext4_fsblk_t block,
			     const char *fmt, ...)
__releases(bitlock)
__acquires(bitlock)
{
	struct va_format vaf;
	va_list args;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
		return;

	trace_ext4_error(sb, function, line);
	/* The console message is rate limited; the handling below is not. */
	if (ext4_error_ratelimit(sb)) {
		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
		       sb->s_id, function, line, grp);
		if (ino)
			printk(KERN_CONT "inode %lu: ", ino);
		if (block)
			printk(KERN_CONT "block %llu:",
			       (unsigned long long) block);
		printk(KERN_CONT "%pV\n", &vaf);
		va_end(args);
	}

	/*
	 * errors=continue: record the error state and, unless the device
	 * is read-only, save the error details and schedule s_error_work.
	 * The group lock is never dropped on this path.
	 */
	if (test_opt(sb, ERRORS_CONT)) {
		if (test_opt(sb, WARN_ON_ERROR))
			WARN_ON_ONCE(1);
		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
		if (!bdev_read_only(sb->s_bdev)) {
			save_error_info(sb, EFSCORRUPTED, ino, block, function,
					line);
			schedule_work(&EXT4_SB(sb)->s_error_work);
		}
		return;
	}
	ext4_unlock_group(sb, grp);
	ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line);
	/*
	 * We only get here in the ERRORS_RO case; relocking the group
	 * may be dangerous, but nothing bad will happen since the
	 * filesystem will have already been marked read/only and the
	 * journal has been aborted.
	 *
	 * NOTE(review): this comment used to promise a return value of 1
	 * as a hint for callers who might want to distinguish between the
	 * ERRORS_CONT and ERRORS_RO cases.  The function returns void, so
	 * no such hint is delivered.
	 */
	ext4_lock_group(sb, grp);
	return;
}
1016
1017void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
1018				     ext4_group_t group,
1019				     unsigned int flags)
1020{
1021	struct ext4_sb_info *sbi = EXT4_SB(sb);
1022	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1023	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
1024	int ret;
1025
1026	if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
1027		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1028					    &grp->bb_state);
1029		if (!ret)
1030			percpu_counter_sub(&sbi->s_freeclusters_counter,
1031					   grp->bb_free);
1032	}
1033
1034	if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
1035		ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
1036					    &grp->bb_state);
1037		if (!ret && gdp) {
1038			int count;
1039
1040			count = ext4_free_inodes_count(sb, gdp);
1041			percpu_counter_sub(&sbi->s_freeinodes_counter,
1042					   count);
1043		}
1044	}
1045}
1046
1047void ext4_update_dynamic_rev(struct super_block *sb)
1048{
1049	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
1050
1051	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
1052		return;
1053
1054	ext4_warning(sb,
1055		     "updating to rev %d because of new feature flag, "
1056		     "running e2fsck is recommended",
1057		     EXT4_DYNAMIC_REV);
1058
1059	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
1060	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
1061	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
1062	/* leave es->s_feature_*compat flags alone */
1063	/* es->s_uuid will be set by e2fsck if empty */
1064
1065	/*
1066	 * The rest of the superblock fields should be zero, and if not it
1067	 * means they are likely already in use, so leave them alone.  We
1068	 * can leave it up to e2fsck to clean up any inconsistencies there.
1069	 */
1070}
1071
1072/*
1073 * Open the external journal device
1074 */
1075static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
1076{
1077	struct block_device *bdev;
1078
1079	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
1080	if (IS_ERR(bdev))
1081		goto fail;
1082	return bdev;
1083
1084fail:
1085	ext4_msg(sb, KERN_ERR,
1086		 "failed to open journal device unknown-block(%u,%u) %ld",
1087		 MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
1088	return NULL;
1089}
1090
1091/*
1092 * Release the journal device
1093 */
1094static void ext4_blkdev_put(struct block_device *bdev)
1095{
1096	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1097}
1098
1099static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
1100{
1101	struct block_device *bdev;
1102	bdev = sbi->s_journal_bdev;
1103	if (bdev) {
1104		ext4_blkdev_put(bdev);
1105		sbi->s_journal_bdev = NULL;
1106	}
1107}
1108
1109static inline struct inode *orphan_list_entry(struct list_head *l)
1110{
1111	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
1112}
1113
1114static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
1115{
1116	struct list_head *l;
1117
1118	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
1119		 le32_to_cpu(sbi->s_es->s_last_orphan));
1120
1121	printk(KERN_ERR "sb_info orphan list:\n");
1122	list_for_each(l, &sbi->s_orphan) {
1123		struct inode *inode = orphan_list_entry(l);
1124		printk(KERN_ERR "  "
1125		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
1126		       inode->i_sb->s_id, inode->i_ino, inode,
1127		       inode->i_mode, inode->i_nlink,
1128		       NEXT_ORPHAN(inode));
1129	}
1130}
1131
1132#ifdef CONFIG_QUOTA
1133static int ext4_quota_off(struct super_block *sb, int type);
1134
1135static inline void ext4_quota_off_umount(struct super_block *sb)
1136{
1137	int type;
1138
1139	/* Use our quota_off function to clear inode flags etc. */
1140	for (type = 0; type < EXT4_MAXQUOTAS; type++)
1141		ext4_quota_off(sb, type);
1142}
1143
1144/*
1145 * This is a helper function which is used in the mount/remount
1146 * codepaths (which holds s_umount) to fetch the quota file name.
1147 */
1148static inline char *get_qf_name(struct super_block *sb,
1149				struct ext4_sb_info *sbi,
1150				int type)
1151{
1152	return rcu_dereference_protected(sbi->s_qf_names[type],
1153					 lockdep_is_held(&sb->s_umount));
1154}
1155#else
/* Without CONFIG_QUOTA there is no quota state to turn off at umount. */
static inline void ext4_quota_off_umount(struct super_block *sb)
{
}
1159#endif
1160
/*
 * ->put_super handler: tear down everything hanging off @sb and free the
 * ext4_sb_info.
 *
 * The sequence below is order sensitive (background work is stopped
 * before the journal is destroyed, the superblock is written back before
 * its buffers are released, and so on); see the inline comments.
 */
static void ext4_put_super(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	struct buffer_head **group_desc;
	struct flex_groups **flex_groups;
	int aborted = 0;
	int i, err;

	ext4_unregister_li_request(sb);
	ext4_quota_off_umount(sb);

	flush_work(&sbi->s_error_work);
	destroy_workqueue(sbi->rsv_conversion_wq);
	ext4_release_orphan_info(sb);

	/*
	 * Unregister sysfs before destroying jbd2 journal.
	 * Since we could still access attr_journal_task attribute via sysfs
	 * path which could have sbi->s_journal->j_task as NULL
	 */
	ext4_unregister_sysfs(sb);

	if (sbi->s_journal) {
		/*
		 * Sample the abort state before the journal is destroyed;
		 * it decides below whether the on-disk state is updated.
		 */
		aborted = is_journal_aborted(sbi->s_journal);
		err = jbd2_journal_destroy(sbi->s_journal);
		sbi->s_journal = NULL;
		if ((err < 0) && !aborted) {
			ext4_abort(sb, -err, "Couldn't clean up the journal");
		}
	}

	ext4_es_unregister_shrinker(sbi);
	del_timer_sync(&sbi->s_err_report);
	ext4_release_system_zone(sb);
	ext4_mb_release(sb);
	ext4_ext_release(sb);

	/*
	 * Only on a writable mount whose journal was not aborted: clear the
	 * recovery/orphan feature flags and store the mount state in the
	 * on-disk superblock.
	 */
	if (!sb_rdonly(sb) && !aborted) {
		ext4_clear_feature_journal_needs_recovery(sb);
		ext4_clear_feature_orphan_present(sb);
		es->s_state = cpu_to_le16(sbi->s_mount_state);
	}
	/* The superblock is committed on any writable mount. */
	if (!sb_rdonly(sb))
		ext4_commit_super(sb);

	/* Release the group descriptor buffers and the flex group array. */
	rcu_read_lock();
	group_desc = rcu_dereference(sbi->s_group_desc);
	for (i = 0; i < sbi->s_gdb_count; i++)
		brelse(group_desc[i]);
	kvfree(group_desc);
	flex_groups = rcu_dereference(sbi->s_flex_groups);
	if (flex_groups) {
		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
			kvfree(flex_groups[i]);
		kvfree(flex_groups);
	}
	rcu_read_unlock();
	percpu_counter_destroy(&sbi->s_freeclusters_counter);
	percpu_counter_destroy(&sbi->s_freeinodes_counter);
	percpu_counter_destroy(&sbi->s_dirs_counter);
	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
	percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(get_qf_name(sb, sbi, i));
#endif

	/* Debugging code just in case the in-memory inode orphan list
	 * isn't empty.  The on-disk one can be non-empty if we've
	 * detected an error and taken the fs readonly, but the
	 * in-memory list had better be clean by this point. */
	if (!list_empty(&sbi->s_orphan))
		dump_orphan_list(sb, sbi);
	ASSERT(list_empty(&sbi->s_orphan));

	sync_blockdev(sb->s_bdev);
	invalidate_bdev(sb->s_bdev);
	if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
		/*
		 * Invalidate the journal device's buffers.  We don't want them
		 * floating about in memory - the physical journal device may
		 * be hotswapped, and it breaks the `ro-after' testing code.
		 */
		sync_blockdev(sbi->s_journal_bdev);
		invalidate_bdev(sbi->s_journal_bdev);
		ext4_blkdev_remove(sbi);
	}

	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
	sbi->s_ea_inode_cache = NULL;

	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
	sbi->s_ea_block_cache = NULL;

	ext4_stop_mmpd(sbi);

	brelse(sbi->s_sbh);
	sb->s_fs_info = NULL;
	/*
	 * Now that we are completely done shutting down the
	 * superblock, we need to actually destroy the kobject.
	 */
	kobject_put(&sbi->s_kobj);
	/* Wait for the kobject release to complete before freeing sbi. */
	wait_for_completion(&sbi->s_kobj_unregister);
	if (sbi->s_chksum_driver)
		crypto_free_shash(sbi->s_chksum_driver);
	kfree(sbi->s_blockgroup_lock);
	fs_put_dax(sbi->s_daxdev);
	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
	utf8_unload(sb->s_encoding);
#endif
	kfree(sbi);
}
1277
1278static struct kmem_cache *ext4_inode_cachep;
1279
1280/*
1281 * Called inside transaction, so use GFP_NOFS
1282 */
1283static struct inode *ext4_alloc_inode(struct super_block *sb)
1284{
1285	struct ext4_inode_info *ei;
1286
1287	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
1288	if (!ei)
1289		return NULL;
1290
1291	inode_set_iversion(&ei->vfs_inode, 1);
1292	spin_lock_init(&ei->i_raw_lock);
1293	INIT_LIST_HEAD(&ei->i_prealloc_list);
1294	atomic_set(&ei->i_prealloc_active, 0);
1295	spin_lock_init(&ei->i_prealloc_lock);
1296	ext4_es_init_tree(&ei->i_es_tree);
1297	rwlock_init(&ei->i_es_lock);
1298	INIT_LIST_HEAD(&ei->i_es_list);
1299	ei->i_es_all_nr = 0;
1300	ei->i_es_shk_nr = 0;
1301	ei->i_es_shrink_lblk = 0;
1302	ei->i_reserved_data_blocks = 0;
1303	spin_lock_init(&(ei->i_block_reservation_lock));
1304	ext4_init_pending_tree(&ei->i_pending_tree);
1305#ifdef CONFIG_QUOTA
1306	ei->i_reserved_quota = 0;
1307	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
1308#endif
1309	ei->jinode = NULL;
1310	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
1311	spin_lock_init(&ei->i_completed_io_lock);
1312	ei->i_sync_tid = 0;
1313	ei->i_datasync_tid = 0;
1314	atomic_set(&ei->i_unwritten, 0);
1315	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
1316	ext4_fc_init_inode(&ei->vfs_inode);
1317	mutex_init(&ei->i_fc_lock);
1318	return &ei->vfs_inode;
1319}
1320
/*
 * ->drop_inode handler: ask the generic code first and, only if it would
 * keep the inode, let fscrypt decide.  The verdict is traced.
 */
static int ext4_drop_inode(struct inode *inode)
{
	int drop;

	drop = generic_drop_inode(inode);
	if (drop == 0)
		drop = fscrypt_drop_inode(inode);

	trace_ext4_drop_inode(inode, drop);
	return drop;
}
1331
1332static void ext4_free_in_core_inode(struct inode *inode)
1333{
1334	fscrypt_free_inode(inode);
1335	if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
1336		pr_warn("%s: inode %ld still in fc list",
1337			__func__, inode->i_ino);
1338	}
1339	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
1340}
1341
1342static void ext4_destroy_inode(struct inode *inode)
1343{
1344	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
1345		ext4_msg(inode->i_sb, KERN_ERR,
1346			 "Inode %lu (%p): orphan list check failed!",
1347			 inode->i_ino, EXT4_I(inode));
1348		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
1349				EXT4_I(inode), sizeof(struct ext4_inode_info),
1350				true);
1351		dump_stack();
1352	}
1353}
1354
1355static void init_once(void *foo)
1356{
1357	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
1358
1359	INIT_LIST_HEAD(&ei->i_orphan);
1360	init_rwsem(&ei->xattr_sem);
1361	init_rwsem(&ei->i_data_sem);
1362	inode_init_once(&ei->vfs_inode);
1363	ext4_fc_init_inode(&ei->vfs_inode);
1364}
1365
1366static int __init init_inodecache(void)
1367{
1368	ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
1369				sizeof(struct ext4_inode_info), 0,
1370				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
1371					SLAB_ACCOUNT),
1372				offsetof(struct ext4_inode_info, i_data),
1373				sizeof_field(struct ext4_inode_info, i_data),
1374				init_once);
1375	if (ext4_inode_cachep == NULL)
1376		return -ENOMEM;
1377	return 0;
1378}
1379
1380static void destroy_inodecache(void)
1381{
1382	/*
1383	 * Make sure all delayed rcu free inodes are flushed before we
1384	 * destroy cache.
1385	 */
1386	rcu_barrier();
1387	kmem_cache_destroy(ext4_inode_cachep);
1388}
1389
1390void ext4_clear_inode(struct inode *inode)
1391{
1392	ext4_fc_del(inode);
1393	invalidate_inode_buffers(inode);
1394	clear_inode(inode);
1395	ext4_discard_preallocations(inode, 0);
1396	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
1397	dquot_drop(inode);
1398	if (EXT4_I(inode)->jinode) {
1399		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
1400					       EXT4_I(inode)->jinode);
1401		jbd2_free_inode(EXT4_I(inode)->jinode);
1402		EXT4_I(inode)->jinode = NULL;
1403	}
1404	fscrypt_put_encryption_info(inode);
1405	fsverity_cleanup_inode(inode);
1406}
1407
1408static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1409					u64 ino, u32 generation)
1410{
1411	struct inode *inode;
1412
1413	/*
1414	 * Currently we don't know the generation for parent directory, so
1415	 * a generation of 0 means "accept any"
1416	 */
1417	inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
1418	if (IS_ERR(inode))
1419		return ERR_CAST(inode);
1420	if (generation && inode->i_generation != generation) {
1421		iput(inode);
1422		return ERR_PTR(-ESTALE);
1423	}
1424
1425	return inode;
1426}
1427
1428static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
1429					int fh_len, int fh_type)
1430{
1431	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1432				    ext4_nfs_get_inode);
1433}
1434
1435static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
1436					int fh_len, int fh_type)
1437{
1438	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1439				    ext4_nfs_get_inode);
1440}
1441
1442static int ext4_nfs_commit_metadata(struct inode *inode)
1443{
1444	struct writeback_control wbc = {
1445		.sync_mode = WB_SYNC_ALL
1446	};
1447
1448	trace_ext4_nfs_commit_metadata(inode);
1449	return ext4_write_inode(inode, &wbc);
1450}
1451
1452#ifdef CONFIG_FS_ENCRYPTION
1453static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
1454{
1455	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
1456				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
1457}
1458
/*
 * fscrypt ->set_context: store the encryption context @ctx (@len bytes)
 * in the encryption xattr of @inode and mark the inode encrypted.
 *
 * @fs_data is either a journal handle (the context is written as part of
 * the caller's transaction) or NULL (a transaction is started here).
 *
 * Returns 0 on success.  Fails early with -EPERM for the root directory,
 * -EINVAL for a non-empty DAX inode and -EOPNOTSUPP for an inode with
 * the DAX inode flag; otherwise returns the error of the failing step.
 */
static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
							void *fs_data)
{
	handle_t *handle = fs_data;
	int res, res2, credits, retries = 0;

	/*
	 * Encrypting the root directory is not allowed because e2fsck expects
	 * lost+found to exist and be unencrypted, and encrypting the root
	 * directory would imply encrypting the lost+found directory as well as
	 * the filename "lost+found" itself.
	 */
	if (inode->i_ino == EXT4_ROOT_INO)
		return -EPERM;

	if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
		return -EINVAL;

	if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
		return -EOPNOTSUPP;

	res = ext4_convert_inline_data(inode);
	if (res)
		return res;

	/*
	 * If a journal handle was specified, then the encryption context is
	 * being set on a new inode via inheritance and is part of a larger
	 * transaction to create the inode.  Otherwise the encryption context is
	 * being set on an existing inode in its own transaction.  Only in the
	 * latter case should the "retry on ENOSPC" logic be used.
	 */

	if (handle) {
		res = ext4_xattr_set_handle(handle, inode,
					    EXT4_XATTR_INDEX_ENCRYPTION,
					    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
					    ctx, len, 0);
		if (!res) {
			ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
			ext4_clear_inode_state(inode,
					EXT4_STATE_MAY_INLINE_DATA);
			/*
			 * Update inode->i_flags - S_ENCRYPTED will be enabled,
			 * S_DAX may be disabled
			 */
			ext4_set_inode_flags(inode, false);
		}
		/* The caller owns the handle; it is not stopped here. */
		return res;
	}

	res = dquot_initialize(inode);
	if (res)
		return res;
retry:
	/* The credits are recomputed on every retry. */
	res = ext4_xattr_set_credits(inode, len, false /* is_create */,
				     &credits);
	if (res)
		return res;

	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
				    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
				    ctx, len, 0);
	if (!res) {
		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
		/*
		 * Update inode->i_flags - S_ENCRYPTED will be enabled,
		 * S_DAX may be disabled
		 */
		ext4_set_inode_flags(inode, false);
		res = ext4_mark_inode_dirty(handle, inode);
		if (res)
			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
	}
	/* Always stop the handle started above, even on failure. */
	res2 = ext4_journal_stop(handle);

	if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	/* An earlier error wins over the one from ext4_journal_stop(). */
	if (!res)
		res = res2;
	return res;
}
1545
1546static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
1547{
1548	return EXT4_SB(sb)->s_dummy_enc_policy.policy;
1549}
1550
1551static bool ext4_has_stable_inodes(struct super_block *sb)
1552{
1553	return ext4_has_feature_stable_inodes(sb);
1554}
1555
1556static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
1557				       int *ino_bits_ret, int *lblk_bits_ret)
1558{
1559	*ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
1560	*lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
1561}
1562
/*
 * fscrypt callbacks for ext4.  Only built when CONFIG_FS_ENCRYPTION is
 * enabled (see the surrounding #ifdef).
 */
static const struct fscrypt_operations ext4_cryptops = {
	.key_prefix		= "ext4:",
	.get_context		= ext4_get_context,
	.set_context		= ext4_set_context,
	.get_dummy_policy	= ext4_get_dummy_policy,
	.empty_dir		= ext4_empty_dir,
	.max_namelen		= EXT4_NAME_LEN,
	.has_stable_inodes	= ext4_has_stable_inodes,
	.get_ino_and_lblk_bits	= ext4_get_ino_and_lblk_bits,
};
1573#endif
1574
1575#ifdef CONFIG_QUOTA
/* Printable quota type names, indexed by quota type. */
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])

/*
 * Forward declarations of the quota callbacks referenced by the
 * operation tables below; they are defined later in this file.
 */
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
			 const struct path *path);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
			       size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
				const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
			     unsigned int flags);
1592
1593static struct dquot **ext4_get_dquots(struct inode *inode)
1594{
1595	return EXT4_I(inode)->i_dquot;
1596}
1597
/*
 * dquot operations for ext4: a mix of ext4-specific callbacks (ext4_*)
 * and generic dquot helpers (dquot_*).
 */
static const struct dquot_operations ext4_quota_operations = {
	.get_reserved_space	= ext4_get_reserved_space,
	.write_dquot		= ext4_write_dquot,
	.acquire_dquot		= ext4_acquire_dquot,
	.release_dquot		= ext4_release_dquot,
	.mark_dirty		= ext4_mark_dquot_dirty,
	.write_info		= ext4_write_info,
	.alloc_dquot		= dquot_alloc,
	.destroy_dquot		= dquot_destroy,
	.get_projid		= ext4_get_projid,
	.get_inode_usage	= ext4_get_inode_usage,
	.get_next_id		= dquot_get_next_id,
};
1611
/*
 * quotactl operations for ext4: only quota on/off are ext4-specific,
 * everything else is delegated to the generic dquot_* helpers.
 */
static const struct quotactl_ops ext4_qctl_operations = {
	.quota_on	= ext4_quota_on,
	.quota_off	= ext4_quota_off,
	.quota_sync	= dquot_quota_sync,
	.get_state	= dquot_get_state,
	.set_info	= dquot_set_dqinfo,
	.get_dqblk	= dquot_get_dqblk,
	.set_dqblk	= dquot_set_dqblk,
	.get_nextdqblk	= dquot_get_next_dqblk,
};
1622#endif
1623
/*
 * Superblock operations for ext4.  The quota read/write/get_dquots hooks
 * are only present when CONFIG_QUOTA is enabled.
 */
static const struct super_operations ext4_sops = {
	.alloc_inode	= ext4_alloc_inode,
	.free_inode	= ext4_free_in_core_inode,
	.destroy_inode	= ext4_destroy_inode,
	.write_inode	= ext4_write_inode,
	.dirty_inode	= ext4_dirty_inode,
	.drop_inode	= ext4_drop_inode,
	.evict_inode	= ext4_evict_inode,
	.put_super	= ext4_put_super,
	.sync_fs	= ext4_sync_fs,
	.freeze_fs	= ext4_freeze,
	.unfreeze_fs	= ext4_unfreeze,
	.statfs		= ext4_statfs,
	.remount_fs	= ext4_remount,
	.show_options	= ext4_show_options,
#ifdef CONFIG_QUOTA
	.quota_read	= ext4_quota_read,
	.quota_write	= ext4_quota_write,
	.get_dquots	= ext4_get_dquots,
#endif
};
1645
/* Export (file handle) operations for ext4. */
static const struct export_operations ext4_export_ops = {
	.fh_to_dentry = ext4_fh_to_dentry,
	.fh_to_parent = ext4_fh_to_parent,
	.get_parent = ext4_get_parent,
	.commit_metadata = ext4_nfs_commit_metadata,
};
1652
/*
 * Mount option token identifiers.  The tokens[] match table maps option
 * strings to these values; several strings may map to one token.  The
 * two fast-commit debug tokens exist only with CONFIG_EXT4_DEBUG.
 */
enum {
	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
	Opt_nouid32, Opt_debug, Opt_removed,
	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
	Opt_inlinecrypt,
	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
	Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
	Opt_nowarn_on_error, Opt_mblk_io_submit,
	Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
	Opt_inode_readahead_blks, Opt_journal_ioprio,
	Opt_dioread_nolock, Opt_dioread_lock,
	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
#ifdef CONFIG_EXT4_DEBUG
	Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};
1682
/*
 * Mount option match table: each entry pairs an Opt_* token with the
 * option pattern that selects it.  Options that are accepted but no
 * longer have any effect map to Opt_removed.  The table is terminated by
 * the {Opt_err, NULL} entry.
 *
 * NOTE(review): where a bare "name=" pattern and a "name=%s" pattern
 * both exist (usrjquota/grpjquota), the bare one is listed first;
 * presumably entry order matters to the matcher -- confirm before
 * reordering.
 */
static const match_table_t tokens = {
	{Opt_bsd_df, "bsddf"},
	{Opt_minix_df, "minixdf"},
	{Opt_grpid, "grpid"},
	{Opt_grpid, "bsdgroups"},
	{Opt_nogrpid, "nogrpid"},
	{Opt_nogrpid, "sysvgroups"},
	{Opt_resgid, "resgid=%u"},
	{Opt_resuid, "resuid=%u"},
	{Opt_sb, "sb=%u"},
	{Opt_err_cont, "errors=continue"},
	{Opt_err_panic, "errors=panic"},
	{Opt_err_ro, "errors=remount-ro"},
	{Opt_nouid32, "nouid32"},
	{Opt_debug, "debug"},
	{Opt_removed, "oldalloc"},
	{Opt_removed, "orlov"},
	{Opt_user_xattr, "user_xattr"},
	{Opt_nouser_xattr, "nouser_xattr"},
	{Opt_acl, "acl"},
	{Opt_noacl, "noacl"},
	{Opt_noload, "norecovery"},
	{Opt_noload, "noload"},
	{Opt_removed, "nobh"},
	{Opt_removed, "bh"},
	{Opt_commit, "commit=%u"},
	{Opt_min_batch_time, "min_batch_time=%u"},
	{Opt_max_batch_time, "max_batch_time=%u"},
	{Opt_journal_dev, "journal_dev=%u"},
	{Opt_journal_path, "journal_path=%s"},
	{Opt_journal_checksum, "journal_checksum"},
	{Opt_nojournal_checksum, "nojournal_checksum"},
	{Opt_journal_async_commit, "journal_async_commit"},
	{Opt_abort, "abort"},
	{Opt_data_journal, "data=journal"},
	{Opt_data_ordered, "data=ordered"},
	{Opt_data_writeback, "data=writeback"},
	{Opt_data_err_abort, "data_err=abort"},
	{Opt_data_err_ignore, "data_err=ignore"},
	{Opt_offusrjquota, "usrjquota="},
	{Opt_usrjquota, "usrjquota=%s"},
	{Opt_offgrpjquota, "grpjquota="},
	{Opt_grpjquota, "grpjquota=%s"},
	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
	{Opt_grpquota, "grpquota"},
	{Opt_noquota, "noquota"},
	{Opt_quota, "quota"},
	{Opt_usrquota, "usrquota"},
	{Opt_prjquota, "prjquota"},
	{Opt_barrier, "barrier=%u"},
	{Opt_barrier, "barrier"},
	{Opt_nobarrier, "nobarrier"},
	{Opt_i_version, "i_version"},
	{Opt_dax, "dax"},
	{Opt_dax_always, "dax=always"},
	{Opt_dax_inode, "dax=inode"},
	{Opt_dax_never, "dax=never"},
	{Opt_stripe, "stripe=%u"},
	{Opt_delalloc, "delalloc"},
	{Opt_warn_on_error, "warn_on_error"},
	{Opt_nowarn_on_error, "nowarn_on_error"},
	{Opt_lazytime, "lazytime"},
	{Opt_nolazytime, "nolazytime"},
	{Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
	{Opt_nodelalloc, "nodelalloc"},
	{Opt_removed, "mblk_io_submit"},
	{Opt_removed, "nomblk_io_submit"},
	{Opt_block_validity, "block_validity"},
	{Opt_noblock_validity, "noblock_validity"},
	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
	{Opt_journal_ioprio, "journal_ioprio=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc"},
	{Opt_noauto_da_alloc, "noauto_da_alloc"},
	{Opt_dioread_nolock, "dioread_nolock"},
	{Opt_dioread_lock, "nodioread_nolock"},
	{Opt_dioread_lock, "dioread_lock"},
	{Opt_discard, "discard"},
	{Opt_nodiscard, "nodiscard"},
	{Opt_init_itable, "init_itable=%u"},
	{Opt_init_itable, "init_itable"},
	{Opt_noinit_itable, "noinit_itable"},
#ifdef CONFIG_EXT4_DEBUG
	{Opt_fc_debug_force, "fc_debug_force"},
	{Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
#endif
	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
	{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
	{Opt_test_dummy_encryption, "test_dummy_encryption"},
	{Opt_inlinecrypt, "inlinecrypt"},
	{Opt_nombcache, "nombcache"},
	{Opt_nombcache, "no_mbcache"},	/* for backward compatibility */
	{Opt_removed, "prefetch_block_bitmaps"},
	{Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"},
	{Opt_mb_optimize_scan, "mb_optimize_scan=%d"},
	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
	{Opt_removed, "noreservation"}, /* mount option from ext2/3 */
	{Opt_removed, "journal=%u"},	/* mount option from ext2/3 */
	{Opt_err, NULL},
};
1787
1788static ext4_fsblk_t get_sb_block(void **data)
1789{
1790	ext4_fsblk_t	sb_block;
1791	char		*options = (char *) *data;
1792
1793	if (!options || strncmp(options, "sb=", 3) != 0)
1794		return 1;	/* Default location */
1795
1796	options += 3;
1797	/* TODO: use simple_strtoll with >32bit ext4 */
1798	sb_block = simple_strtoul(options, &options, 0);
1799	if (*options && *options != ',') {
1800		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
1801		       (char *) *data);
1802		return 1;
1803	}
1804	if (*options == ',')
1805		options++;
1806	*data = (void *) options;
1807
1808	return sb_block;
1809}
1810
/* Default journal I/O priority: IOPRIO_CLASS_BE with priority data 3. */
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
/* NOTE(review): -1 presumably means "mb_optimize_scan not specified"; confirm at the use site. */
#define DEFAULT_MB_OPTIMIZE_SCAN	(-1)

/*
 * printk format for deprecated mount options; takes two %s arguments:
 * the option name and when it will be removed.
 */
static const char deprecated_msg[] =
	"Mount option \"%s\" will be removed by %s\n"
	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1817
1818#ifdef CONFIG_QUOTA
1819static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1820{
1821	struct ext4_sb_info *sbi = EXT4_SB(sb);
1822	char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
1823	int ret = -1;
1824
1825	if (sb_any_quota_loaded(sb) && !old_qname) {
1826		ext4_msg(sb, KERN_ERR,
1827			"Cannot change journaled "
1828			"quota options when quota turned on");
1829		return -1;
1830	}
1831	if (ext4_has_feature_quota(sb)) {
1832		ext4_msg(sb, KERN_INFO, "Journaled quota options "
1833			 "ignored when QUOTA feature is enabled");
1834		return 1;
1835	}
1836	qname = match_strdup(args);
1837	if (!qname) {
1838		ext4_msg(sb, KERN_ERR,
1839			"Not enough memory for storing quotafile name");
1840		return -1;
1841	}
1842	if (old_qname) {
1843		if (strcmp(old_qname, qname) == 0)
1844			ret = 1;
1845		else
1846			ext4_msg(sb, KERN_ERR,
1847				 "%s quota file already specified",
1848				 QTYPE2NAME(qtype));
1849		goto errout;
1850	}
1851	if (strchr(qname, '/')) {
1852		ext4_msg(sb, KERN_ERR,
1853			"quotafile must be on filesystem root");
1854		goto errout;
1855	}
1856	rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
1857	set_opt(sb, QUOTA);
1858	return 1;
1859errout:
1860	kfree(qname);
1861	return ret;
1862}
1863
1864static int clear_qf_name(struct super_block *sb, int qtype)
1865{
1866
1867	struct ext4_sb_info *sbi = EXT4_SB(sb);
1868	char *old_qname = get_qf_name(sb, sbi, qtype);
1869
1870	if (sb_any_quota_loaded(sb) && old_qname) {
1871		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1872			" when quota turned on");
1873		return -1;
1874	}
1875	rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
1876	synchronize_rcu();
1877	kfree(old_qname);
1878	return 1;
1879}
1880#endif
1881
/*
 * Flag bits used in the 'flags' member of struct mount_opts.
 *
 * Without CONFIG_QUOTA, MOPT_Q and MOPT_QFMT both collapse to
 * MOPT_NOSUPPORT; with it, MOPT_Q is 0 (no extra flag) and MOPT_QFMT has
 * its own bit.  MOPT_EXT4_ONLY is the union of MOPT_NO_EXT2 and
 * MOPT_NO_EXT3.
 */
#define MOPT_SET	0x0001
#define MOPT_CLEAR	0x0002
#define MOPT_NOSUPPORT	0x0004
#define MOPT_EXPLICIT	0x0008
#define MOPT_CLEAR_ERR	0x0010
#define MOPT_GTE0	0x0020
#ifdef CONFIG_QUOTA
#define MOPT_Q		0
#define MOPT_QFMT	0x0040
#else
#define MOPT_Q		MOPT_NOSUPPORT
#define MOPT_QFMT	MOPT_NOSUPPORT
#endif
#define MOPT_DATAJ	0x0080
#define MOPT_NO_EXT2	0x0100
#define MOPT_NO_EXT3	0x0200
#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING	0x0400
#define MOPT_SKIP	0x0800
#define	MOPT_2		0x1000
1902
/*
 * Table describing every mount option that handle_mount_opt() does not
 * handle before the table lookup.
 *
 * @token:     the Opt_* value produced by match_token()
 * @mount_opt: bits (or, for MOPT_QFMT entries, the quota format value) the
 *             option applies; 0 when the option stores its value elsewhere
 * @flags:     MOPT_* flags selecting how the option is processed
 *
 * The table is terminated by an entry whose token is Opt_err; both
 * handle_mount_opt() and _ext4_show_options() stop scanning there.
 */
1903static const struct mount_opts {
1904	int	token;
1905	int	mount_opt;
1906	int	flags;
1907} ext4_mount_opts[] = {
1908	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
1909	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
1910	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
1911	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
1912	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
1913	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
1914	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
1915	 MOPT_EXT4_ONLY | MOPT_SET},
1916	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
1917	 MOPT_EXT4_ONLY | MOPT_CLEAR},
1918	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
1919	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
1920	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
1921	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1922	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1923	 MOPT_EXT4_ONLY | MOPT_CLEAR},
1924	{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
1925	{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
1926	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1927	 MOPT_EXT4_ONLY | MOPT_CLEAR},
1928	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1929	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1930	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
1931				    EXT4_MOUNT_JOURNAL_CHECKSUM),
1932	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1933	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
1934	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
1935	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1936	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
	/* data_err= entries carry neither SET nor CLEAR; handled by token. */
1937	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
1938	 MOPT_NO_EXT2},
1939	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
1940	 MOPT_NO_EXT2},
1941	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1942	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1943	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
1944	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
1945	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
1946	{Opt_commit, 0, MOPT_GTE0},
1947	{Opt_max_batch_time, 0, MOPT_GTE0},
1948	{Opt_min_batch_time, 0, MOPT_GTE0},
1949	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
1950	{Opt_init_itable, 0, MOPT_GTE0},
	/* dax entries are MOPT_SKIP: parsed and printed by dedicated code. */
1951	{Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
1952	{Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
1953		MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
1954	{Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
1955		MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
1956	{Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
1957		MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
1958	{Opt_stripe, 0, MOPT_GTE0},
1959	{Opt_resuid, 0, MOPT_GTE0},
1960	{Opt_resgid, 0, MOPT_GTE0},
1961	{Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
1962	{Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
1963	{Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
1964	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
1965	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
1966	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
1967	 MOPT_NO_EXT2 | MOPT_DATAJ},
1968	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1969	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1970#ifdef CONFIG_EXT4_FS_POSIX_ACL
1971	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1972	{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
1973#else
1974	{Opt_acl, 0, MOPT_NOSUPPORT},
1975	{Opt_noacl, 0, MOPT_NOSUPPORT},
1976#endif
1977	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
1978	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
1979	{Opt_debug_want_extra_isize, 0, MOPT_GTE0},
1980	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
1981	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
1982							MOPT_SET | MOPT_Q},
1983	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
1984							MOPT_SET | MOPT_Q},
1985	{Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
1986							MOPT_SET | MOPT_Q},
1987	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
1988		       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
1989							MOPT_CLEAR | MOPT_Q},
1990	{Opt_usrjquota, 0, MOPT_Q | MOPT_STRING},
1991	{Opt_grpjquota, 0, MOPT_Q | MOPT_STRING},
1992	{Opt_offusrjquota, 0, MOPT_Q},
1993	{Opt_offgrpjquota, 0, MOPT_Q},
1994	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1995	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1996	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1997	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
1998	{Opt_test_dummy_encryption, 0, MOPT_STRING},
1999	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
2000	{Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
2001	 MOPT_SET},
2002	{Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0},
2003#ifdef CONFIG_EXT4_DEBUG
2004	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
2005	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
2006	{Opt_fc_debug_max_replay, 0, MOPT_GTE0},
2007#endif
	/* Terminator. */
2008	{Opt_err, 0, 0}
2009};
2010
2011#ifdef CONFIG_UNICODE
2012static const struct ext4_sb_encodings {
2013	__u16 magic;
2014	char *name;
2015	char *version;
2016} ext4_sb_encoding_map[] = {
2017	{EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
2018};
2019
2020static int ext4_sb_read_encoding(const struct ext4_super_block *es,
2021				 const struct ext4_sb_encodings **encoding,
2022				 __u16 *flags)
2023{
2024	__u16 magic = le16_to_cpu(es->s_encoding);
2025	int i;
2026
2027	for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
2028		if (magic == ext4_sb_encoding_map[i].magic)
2029			break;
2030
2031	if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
2032		return -EINVAL;
2033
2034	*encoding = &ext4_sb_encoding_map[i];
2035	*flags = le16_to_cpu(es->s_encoding_flags);
2036
2037	return 0;
2038}
2039#endif
2040
2041static int ext4_set_test_dummy_encryption(struct super_block *sb,
2042					  const char *opt,
2043					  const substring_t *arg,
2044					  bool is_remount)
2045{
2046#ifdef CONFIG_FS_ENCRYPTION
2047	struct ext4_sb_info *sbi = EXT4_SB(sb);
2048	int err;
2049
2050	/*
2051	 * This mount option is just for testing, and it's not worthwhile to
2052	 * implement the extra complexity (e.g. RCU protection) that would be
2053	 * needed to allow it to be set or changed during remount.  We do allow
2054	 * it to be specified during remount, but only if there is no change.
2055	 */
2056	if (is_remount && !sbi->s_dummy_enc_policy.policy) {
2057		ext4_msg(sb, KERN_WARNING,
2058			 "Can't set test_dummy_encryption on remount");
2059		return -1;
2060	}
2061	err = fscrypt_set_test_dummy_encryption(sb, arg->from,
2062						&sbi->s_dummy_enc_policy);
2063	if (err) {
2064		if (err == -EEXIST)
2065			ext4_msg(sb, KERN_WARNING,
2066				 "Can't change test_dummy_encryption on remount");
2067		else if (err == -EINVAL)
2068			ext4_msg(sb, KERN_WARNING,
2069				 "Value of option \"%s\" is unrecognized", opt);
2070		else
2071			ext4_msg(sb, KERN_WARNING,
2072				 "Error processing option \"%s\" [%d]",
2073				 opt, err);
2074		return -1;
2075	}
2076	ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
2077#else
2078	ext4_msg(sb, KERN_WARNING,
2079		 "Test dummy encryption mount option ignored");
2080#endif
2081	return 1;
2082}
2083
/*
 * Values collected by handle_mount_opt() that are returned to the caller
 * of parse_options() instead of being written into the ext4_sb_info.
 */
2084struct ext4_parsed_options {
	/* from journal_dev=, or the encoded device number of journal_path= */
2085	unsigned long journal_devnum;
	/* IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, n) built from journal_ioprio=n */
2086	unsigned int journal_ioprio;
	/* value of mb_optimize_scan=, accepted only as 0 or 1 */
2087	int mb_optimize_scan;
2088};
2089
/*
 * Apply a single parsed mount option to the superblock.
 *
 * @sb:          superblock being mounted or remounted
 * @opt:         text of the option, used in log messages
 * @token:       token match_token() produced for @opt
 * @args:        substrings captured by match_token(); args->from is NULL
 *               when the option carried no argument
 * @parsed_opts: receives the journal device number, journal I/O priority
 *               and mb_optimize_scan value
 * @is_remount:  non-zero on remount; several options cannot change then
 *
 * Returns 1 when the option was accepted (or deliberately ignored) and -1
 * when it was rejected.  For the usrjquota/grpjquota options the result of
 * set_qf_name()/clear_qf_name() is returned directly.
 */
2090static int handle_mount_opt(struct super_block *sb, char *opt, int token,
2091			    substring_t *args, struct ext4_parsed_options *parsed_opts,
2092			    int is_remount)
2093{
2094	struct ext4_sb_info *sbi = EXT4_SB(sb);
2095	const struct mount_opts *m;
2096	kuid_t uid;
2097	kgid_t gid;
2098	int arg = 0;
2099
2100#ifdef CONFIG_QUOTA
2101	if (token == Opt_usrjquota)
2102		return set_qf_name(sb, USRQUOTA, &args[0]);
2103	else if (token == Opt_grpjquota)
2104		return set_qf_name(sb, GRPQUOTA, &args[0]);
2105	else if (token == Opt_offusrjquota)
2106		return clear_qf_name(sb, USRQUOTA);
2107	else if (token == Opt_offgrpjquota)
2108		return clear_qf_name(sb, GRPQUOTA);
2109#endif
	/*
	 * Options handled without consulting ext4_mount_opts.  The two
	 * deprecated options only warn here and then fall through to the
	 * table lookup below.
	 */
2110	switch (token) {
2111	case Opt_noacl:
2112	case Opt_nouser_xattr:
2113		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
2114		break;
2115	case Opt_sb:
2116		return 1;	/* handled by get_sb_block() */
2117	case Opt_removed:
2118		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
2119		return 1;
2120	case Opt_abort:
2121		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
2122		return 1;
2123	case Opt_i_version:
2124		sb->s_flags |= SB_I_VERSION;
2125		return 1;
2126	case Opt_lazytime:
2127		sb->s_flags |= SB_LAZYTIME;
2128		return 1;
2129	case Opt_nolazytime:
2130		sb->s_flags &= ~SB_LAZYTIME;
2131		return 1;
2132	case Opt_inlinecrypt:
2133#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
2134		sb->s_flags |= SB_INLINECRYPT;
2135#else
2136		ext4_msg(sb, KERN_ERR, "inline encryption not supported");
2137#endif
2138		return 1;
2139	}
2140
	/* Find the table entry for this token; Opt_err ends the table. */
2141	for (m = ext4_mount_opts; m->token != Opt_err; m++)
2142		if (token == m->token)
2143			break;
2144
2145	if (m->token == Opt_err) {
2146		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
2147			 "or missing value", opt);
2148		return -1;
2149	}
2150
2151	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
2152		ext4_msg(sb, KERN_ERR,
2153			 "Mount option \"%s\" incompatible with ext2", opt);
2154		return -1;
2155	}
2156	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
2157		ext4_msg(sb, KERN_ERR,
2158			 "Mount option \"%s\" incompatible with ext3", opt);
2159		return -1;
2160	}
2161
	/* Any argument to a non-MOPT_STRING option must parse as an int. */
2162	if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
2163		return -1;
2164	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
2165		return -1;
	/* Remember in s_mount_opt2 that the user asked for this explicitly. */
2166	if (m->flags & MOPT_EXPLICIT) {
2167		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
2168			set_opt2(sb, EXPLICIT_DELALLOC);
2169		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
2170			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
2171		} else
2172			return -1;
2173	}
2174	if (m->flags & MOPT_CLEAR_ERR)
2175		clear_opt(sb, ERRORS_MASK);
2176	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
2177		ext4_msg(sb, KERN_ERR, "Cannot change quota "
2178			 "options when quota turned on");
2179		return -1;
2180	}
2181
	/*
	 * Dispatch.  Options with dedicated handling come first; whatever
	 * is left falls into the generic set/clear branch at the end.
	 */
2182	if (m->flags & MOPT_NOSUPPORT) {
2183		ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
2184	} else if (token == Opt_commit) {
2185		if (arg == 0)
2186			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
2187		else if (arg > INT_MAX / HZ) {
2188			ext4_msg(sb, KERN_ERR,
2189				 "Invalid commit interval %d, "
2190				 "must be smaller than %d",
2191				 arg, INT_MAX / HZ);
2192			return -1;
2193		}
		/* the bound above keeps HZ * arg within int range */
2194		sbi->s_commit_interval = HZ * arg;
2195	} else if (token == Opt_debug_want_extra_isize) {
2196		if ((arg & 1) ||
2197		    (arg < 4) ||
2198		    (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
2199			ext4_msg(sb, KERN_ERR,
2200				 "Invalid want_extra_isize %d", arg);
2201			return -1;
2202		}
2203		sbi->s_want_extra_isize = arg;
2204	} else if (token == Opt_max_batch_time) {
2205		sbi->s_max_batch_time = arg;
2206	} else if (token == Opt_min_batch_time) {
2207		sbi->s_min_batch_time = arg;
2208	} else if (token == Opt_inode_readahead_blks) {
2209		if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
2210			ext4_msg(sb, KERN_ERR,
2211				 "EXT4-fs: inode_readahead_blks must be "
2212				 "0 or a power of 2 smaller than 2^31");
2213			return -1;
2214		}
2215		sbi->s_inode_readahead_blks = arg;
2216	} else if (token == Opt_init_itable) {
2217		set_opt(sb, INIT_INODE_TABLE);
		/* init_itable given without "=n": use the default multiplier */
2218		if (!args->from)
2219			arg = EXT4_DEF_LI_WAIT_MULT;
2220		sbi->s_li_wait_mult = arg;
2221	} else if (token == Opt_max_dir_size_kb) {
2222		sbi->s_max_dir_size_kb = arg;
2223#ifdef CONFIG_EXT4_DEBUG
2224	} else if (token == Opt_fc_debug_max_replay) {
2225		sbi->s_fc_debug_max_replay = arg;
2226#endif
2227	} else if (token == Opt_stripe) {
2228		sbi->s_stripe = arg;
2229	} else if (token == Opt_resuid) {
2230		uid = make_kuid(current_user_ns(), arg);
2231		if (!uid_valid(uid)) {
2232			ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
2233			return -1;
2234		}
2235		sbi->s_resuid = uid;
2236	} else if (token == Opt_resgid) {
2237		gid = make_kgid(current_user_ns(), arg);
2238		if (!gid_valid(gid)) {
2239			ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
2240			return -1;
2241		}
2242		sbi->s_resgid = gid;
2243	} else if (token == Opt_journal_dev) {
2244		if (is_remount) {
2245			ext4_msg(sb, KERN_ERR,
2246				 "Cannot specify journal on remount");
2247			return -1;
2248		}
2249		parsed_opts->journal_devnum = arg;
2250	} else if (token == Opt_journal_path) {
2251		char *journal_path;
2252		struct inode *journal_inode;
2253		struct path path;
2254		int error;
2255
2256		if (is_remount) {
2257			ext4_msg(sb, KERN_ERR,
2258				 "Cannot specify journal on remount");
2259			return -1;
2260		}
		/* journal_path is a private copy; every exit below frees it */
2261		journal_path = match_strdup(&args[0]);
2262		if (!journal_path) {
2263			ext4_msg(sb, KERN_ERR, "error: could not dup "
2264				"journal device string");
2265			return -1;
2266		}
2267
2268		error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
2269		if (error) {
2270			ext4_msg(sb, KERN_ERR, "error: could not find "
2271				"journal device path: error %d", error);
2272			kfree(journal_path);
2273			return -1;
2274		}
2275
2276		journal_inode = d_inode(path.dentry);
2277		if (!S_ISBLK(journal_inode->i_mode)) {
2278			ext4_msg(sb, KERN_ERR, "error: journal path %s "
2279				"is not a block device", journal_path);
2280			path_put(&path);
2281			kfree(journal_path);
2282			return -1;
2283		}
2284
2285		parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev);
2286		path_put(&path);
2287		kfree(journal_path);
2288	} else if (token == Opt_journal_ioprio) {
2289		if (arg > 7) {
2290			ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
2291				 " (must be 0-7)");
2292			return -1;
2293		}
2294		parsed_opts->journal_ioprio =
2295			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
2296	} else if (token == Opt_test_dummy_encryption) {
2297		return ext4_set_test_dummy_encryption(sb, opt, &args[0],
2298						      is_remount);
2299	} else if (m->flags & MOPT_DATAJ) {
		/* on remount the data mode may be restated but not changed */
2300		if (is_remount) {
2301			if (!sbi->s_journal)
2302				ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
2303			else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
2304				ext4_msg(sb, KERN_ERR,
2305					 "Cannot change data mode on remount");
2306				return -1;
2307			}
2308		} else {
2309			clear_opt(sb, DATA_FLAGS);
2310			sbi->s_mount_opt |= m->mount_opt;
2311		}
2312#ifdef CONFIG_QUOTA
2313	} else if (m->flags & MOPT_QFMT) {
2314		if (sb_any_quota_loaded(sb) &&
2315		    sbi->s_jquota_fmt != m->mount_opt) {
2316			ext4_msg(sb, KERN_ERR, "Cannot change journaled "
2317				 "quota options when quota turned on");
2318			return -1;
2319		}
2320		if (ext4_has_feature_quota(sb)) {
2321			ext4_msg(sb, KERN_INFO,
2322				 "Quota format mount options ignored "
2323				 "when QUOTA feature is enabled");
2324			return 1;
2325		}
2326		sbi->s_jquota_fmt = m->mount_opt;
2327#endif
2328	} else if (token == Opt_dax || token == Opt_dax_always ||
2329		   token == Opt_dax_inode || token == Opt_dax_never) {
2330#ifdef CONFIG_FS_DAX
		/*
		 * On remount each dax mode is accepted only if it is the
		 * mode already in effect; the other cases jump to the
		 * fail_dax_change_remount label inside the first case.
		 */
2331		switch (token) {
2332		case Opt_dax:
2333		case Opt_dax_always:
2334			if (is_remount &&
2335			    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
2336			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
2337			fail_dax_change_remount:
2338				ext4_msg(sb, KERN_ERR, "can't change "
2339					 "dax mount option while remounting");
2340				return -1;
2341			}
2342			if (is_remount &&
2343			    (test_opt(sb, DATA_FLAGS) ==
2344			     EXT4_MOUNT_JOURNAL_DATA)) {
2345				    ext4_msg(sb, KERN_ERR, "can't mount with "
2346					     "both data=journal and dax");
2347				    return -1;
2348			}
2349			ext4_msg(sb, KERN_WARNING,
2350				"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
2351			sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
2352			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
2353			break;
2354		case Opt_dax_never:
2355			if (is_remount &&
2356			    (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
2357			     (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
2358				goto fail_dax_change_remount;
2359			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
2360			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
2361			break;
2362		case Opt_dax_inode:
2363			if (is_remount &&
2364			    ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
2365			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
2366			     !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
2367				goto fail_dax_change_remount;
2368			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
2369			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
2370			/* Strictly for printing options */
2371			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
2372			break;
2373		}
2374#else
		/* without CONFIG_FS_DAX every dax option is rejected */
2375		ext4_msg(sb, KERN_INFO, "dax option not supported");
2376		sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
2377		sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
2378		return -1;
2379#endif
2380	} else if (token == Opt_data_err_abort) {
2381		sbi->s_mount_opt |= m->mount_opt;
2382	} else if (token == Opt_data_err_ignore) {
2383		sbi->s_mount_opt &= ~m->mount_opt;
2384	} else if (token == Opt_mb_optimize_scan) {
2385		if (arg != 0 && arg != 1) {
2386			ext4_msg(sb, KERN_WARNING,
2387				 "mb_optimize_scan should be set to 0 or 1.");
2388			return -1;
2389		}
2390		parsed_opts->mb_optimize_scan = arg;
2391	} else {
		/*
		 * Generic flag option.  A bare option counts as arg == 1;
		 * MOPT_CLEAR inverts the sense, and MOPT_2 selects
		 * s_mount_opt2 instead of s_mount_opt as the target word.
		 */
2392		if (!args->from)
2393			arg = 1;
2394		if (m->flags & MOPT_CLEAR)
2395			arg = !arg;
2396		else if (unlikely(!(m->flags & MOPT_SET))) {
2397			ext4_msg(sb, KERN_WARNING,
2398				 "buggy handling of option %s", opt);
2399			WARN_ON(1);
2400			return -1;
2401		}
2402		if (m->flags & MOPT_2) {
2403			if (arg != 0)
2404				sbi->s_mount_opt2 |= m->mount_opt;
2405			else
2406				sbi->s_mount_opt2 &= ~m->mount_opt;
2407		} else {
2408			if (arg != 0)
2409				sbi->s_mount_opt |= m->mount_opt;
2410			else
2411				sbi->s_mount_opt &= ~m->mount_opt;
2412		}
2413	}
2414	return 1;
2415}
2416
2417static int parse_options(char *options, struct super_block *sb,
2418			 struct ext4_parsed_options *ret_opts,
2419			 int is_remount)
2420{
2421	struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
2422	char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
2423	substring_t args[MAX_OPT_ARGS];
2424	int token;
2425
2426	if (!options)
2427		return 1;
2428
2429	while ((p = strsep(&options, ",")) != NULL) {
2430		if (!*p)
2431			continue;
2432		/*
2433		 * Initialize args struct so we know whether arg was
2434		 * found; some options take optional arguments.
2435		 */
2436		args[0].to = args[0].from = NULL;
2437		token = match_token(p, tokens, args);
2438		if (handle_mount_opt(sb, p, token, args, ret_opts,
2439				     is_remount) < 0)
2440			return 0;
2441	}
2442#ifdef CONFIG_QUOTA
2443	/*
2444	 * We do the test below only for project quotas. 'usrquota' and
2445	 * 'grpquota' mount options are allowed even without quota feature
2446	 * to support legacy quotas in quota files.
2447	 */
2448	if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
2449		ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
2450			 "Cannot enable project quota enforcement.");
2451		return 0;
2452	}
2453	usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
2454	grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
2455	if (usr_qf_name || grp_qf_name) {
2456		if (test_opt(sb, USRQUOTA) && usr_qf_name)
2457			clear_opt(sb, USRQUOTA);
2458
2459		if (test_opt(sb, GRPQUOTA) && grp_qf_name)
2460			clear_opt(sb, GRPQUOTA);
2461
2462		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
2463			ext4_msg(sb, KERN_ERR, "old and new quota "
2464					"format mixing");
2465			return 0;
2466		}
2467
2468		if (!sbi->s_jquota_fmt) {
2469			ext4_msg(sb, KERN_ERR, "journaled quota format "
2470					"not specified");
2471			return 0;
2472		}
2473	}
2474#endif
2475	if (test_opt(sb, DIOREAD_NOLOCK)) {
2476		int blocksize =
2477			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
2478		if (blocksize < PAGE_SIZE)
2479			ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
2480				 "experimental mount option 'dioread_nolock' "
2481				 "for blocksize < PAGE_SIZE");
2482	}
2483	return 1;
2484}
2485
/*
 * Emit the journaled-quota mount options (jqfmt=, usrjquota=, grpjquota=)
 * for @sb into @seq.  Does nothing unless CONFIG_QUOTA is defined.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
					   struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *user_file, *group_file;

	if (sbi->s_jquota_fmt) {
		char *fmtname;

		if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
			fmtname = "vfsold";
		else if (sbi->s_jquota_fmt == QFMT_VFS_V0)
			fmtname = "vfsv0";
		else if (sbi->s_jquota_fmt == QFMT_VFS_V1)
			fmtname = "vfsv1";
		else
			fmtname = "";

		seq_printf(seq, ",jqfmt=%s", fmtname);
	}

	/* The quota file names are read, and printed, under the RCU read lock. */
	rcu_read_lock();
	user_file = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
	group_file = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
	if (user_file)
		seq_show_option(seq, "usrjquota", user_file);
	if (group_file)
		seq_show_option(seq, "grpjquota", group_file);
	rcu_read_unlock();
#endif
}
2520
2521static const char *token2str(int token)
2522{
2523	const struct match_token *t;
2524
2525	for (t = tokens; t->token != Opt_err; t++)
2526		if (t->token == token && !strchr(t->pattern, '='))
2527			break;
2528	return t->pattern;
2529}
2530
2531/*
2532 * Show an option if
2533 *  - it's set to a non-default value OR
2534 *  - if the per-sb default is different from the global default
2535 */
/*
 * @seq:    seq_file receiving the output
 * @sb:     superblock whose options are printed
 * @nodefs: when non-zero, options are separated by '\n' instead of ',' and
 *          the "differs from the default" filtering is bypassed
 *
 * Always returns 0.
 *
 * NOTE(review): the generic table loop below tests m->mount_opt against
 * s_mount_opt only.  An entry flagged MOPT_2 without MOPT_SKIP (such as
 * Opt_fc_debug_force), whose bits handle_mount_opt() stores in
 * s_mount_opt2, therefore looks like it is checked against the wrong
 * word -- confirm.
 */
2536static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
2537			      int nodefs)
2538{
2539	struct ext4_sb_info *sbi = EXT4_SB(sb);
2540	struct ext4_super_block *es = sbi->s_es;
2541	int def_errors, def_mount_opt = sbi->s_def_mount_opt;
2542	const struct mount_opts *m;
2543	char sep = nodefs ? '\n' : ',';
2544
	/* Both helpers emit the separator first, then the option text. */
2545#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
2546#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
2547
2548	if (sbi->s_sb_block != 1)
2549		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
2550
	/*
	 * Plain flag options.  Entries that are neither SET nor CLEAR, the
	 * errors= entries (MOPT_CLEAR_ERR) and MOPT_SKIP entries are left
	 * to the explicit code further down or are not printed here.
	 */
2551	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
2552		int want_set = m->flags & MOPT_SET;
2553		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
2554		    (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
2555			continue;
2556		if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
2557			continue; /* skip if same as the default */
2558		if ((want_set &&
2559		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
2560		    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
2561			continue; /* select Opt_noFoo vs Opt_Foo */
2562		SEQ_OPTS_PRINT("%s", token2str(m->token));
2563	}
2564
2565	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
2566	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
2567		SEQ_OPTS_PRINT("resuid=%u",
2568				from_kuid_munged(&init_user_ns, sbi->s_resuid));
2569	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
2570	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
2571		SEQ_OPTS_PRINT("resgid=%u",
2572				from_kgid_munged(&init_user_ns, sbi->s_resgid));
	/* -1 is compared against each EXT4_ERRORS_* below, so with nodefs the active errors= mode is always shown */
2573	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
2574	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
2575		SEQ_OPTS_PUTS("errors=remount-ro");
2576	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
2577		SEQ_OPTS_PUTS("errors=continue");
2578	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
2579		SEQ_OPTS_PUTS("errors=panic");
2580	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
2581		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
2582	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
2583		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
2584	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
2585		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
2586	if (sb->s_flags & SB_I_VERSION)
2587		SEQ_OPTS_PUTS("i_version");
2588	if (nodefs || sbi->s_stripe)
2589		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
2590	if (nodefs || EXT4_MOUNT_DATA_FLAGS &
2591			(sbi->s_mount_opt ^ def_mount_opt)) {
2592		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
2593			SEQ_OPTS_PUTS("data=journal");
2594		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
2595			SEQ_OPTS_PUTS("data=ordered");
2596		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
2597			SEQ_OPTS_PUTS("data=writeback");
2598	}
2599	if (nodefs ||
2600	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
2601		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
2602			       sbi->s_inode_readahead_blks);
2603
2604	if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
2605		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
2606		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
2607	if (nodefs || sbi->s_max_dir_size_kb)
2608		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
2609	if (test_opt(sb, DATA_ERR_ABORT))
2610		SEQ_OPTS_PUTS("data_err=abort");
2611
2612	fscrypt_show_test_dummy_encryption(seq, sep, sb);
2613
2614	if (sb->s_flags & SB_INLINECRYPT)
2615		SEQ_OPTS_PUTS("inlinecrypt");
2616
	/* ext2 mounts print the bare "dax" spelling for the always mode. */
2617	if (test_opt(sb, DAX_ALWAYS)) {
2618		if (IS_EXT2_SB(sb))
2619			SEQ_OPTS_PUTS("dax");
2620		else
2621			SEQ_OPTS_PUTS("dax=always");
2622	} else if (test_opt2(sb, DAX_NEVER)) {
2623		SEQ_OPTS_PUTS("dax=never");
2624	} else if (test_opt2(sb, DAX_INODE)) {
2625		SEQ_OPTS_PUTS("dax=inode");
2626	}
2627	ext4_show_quota_options(seq, sb);
2628	return 0;
2629}
2630
2631static int ext4_show_options(struct seq_file *seq, struct dentry *root)
2632{
2633	return _ext4_show_options(seq, root->d_sb, 0);
2634}
2635
2636int ext4_seq_options_show(struct seq_file *seq, void *offset)
2637{
2638	struct super_block *sb = seq->private;
2639	int rc;
2640
2641	seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
2642	rc = _ext4_show_options(seq, sb, 1);
2643	seq_puts(seq, "\n");
2644	return rc;
2645}
2646
2647static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
2648			    int read_only)
2649{
2650	struct ext4_sb_info *sbi = EXT4_SB(sb);
2651	int err = 0;
2652
2653	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
2654		ext4_msg(sb, KERN_ERR, "revision level too high, "
2655			 "forcing read-only mode");
2656		err = -EROFS;
2657		goto done;
2658	}
2659	if (read_only)
2660		goto done;
2661	if (!(sbi->s_mount_state & EXT4_VALID_FS))
2662		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
2663			 "running e2fsck is recommended");
2664	else if (sbi->s_mount_state & EXT4_ERROR_FS)
2665		ext4_msg(sb, KERN_WARNING,
2666			 "warning: mounting fs with errors, "
2667			 "running e2fsck is recommended");
2668	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
2669		 le16_to_cpu(es->s_mnt_count) >=
2670		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
2671		ext4_msg(sb, KERN_WARNING,
2672			 "warning: maximal mount count reached, "
2673			 "running e2fsck is recommended");
2674	else if (le32_to_cpu(es->s_checkinterval) &&
2675		 (ext4_get_tstamp(es, s_lastcheck) +
2676		  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
2677		ext4_msg(sb, KERN_WARNING,
2678			 "warning: checktime reached, "
2679			 "running e2fsck is recommended");
2680	if (!sbi->s_journal)
2681		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
2682	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
2683		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
2684	le16_add_cpu(&es->s_mnt_count, 1);
2685	ext4_update_tstamp(es, s_mtime);
2686	if (sbi->s_journal) {
2687		ext4_set_feature_journal_needs_recovery(sb);
2688		if (ext4_has_feature_orphan_file(sb))
2689			ext4_set_feature_orphan_present(sb);
2690	}
2691
2692	err = ext4_commit_super(sb);
2693done:
2694	if (test_opt(sb, DEBUG))
2695		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
2696				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
2697			sb->s_blocksize,
2698			sbi->s_groups_count,
2699			EXT4_BLOCKS_PER_GROUP(sb),
2700			EXT4_INODES_PER_GROUP(sb),
2701			sbi->s_mount_opt, sbi->s_mount_opt2);
2702
2703	cleancache_init_fs(sb);
2704	return err;
2705}
2706
2707int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
2708{
2709	struct ext4_sb_info *sbi = EXT4_SB(sb);
2710	struct flex_groups **old_groups, **new_groups;
2711	int size, i, j;
2712
2713	if (!sbi->s_log_groups_per_flex)
2714		return 0;
2715
2716	size = ext4_flex_group(sbi, ngroup - 1) + 1;
2717	if (size <= sbi->s_flex_groups_allocated)
2718		return 0;
2719
2720	new_groups = kvzalloc(roundup_pow_of_two(size *
2721			      sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
2722	if (!new_groups) {
2723		ext4_msg(sb, KERN_ERR,
2724			 "not enough memory for %d flex group pointers", size);
2725		return -ENOMEM;
2726	}
2727	for (i = sbi->s_flex_groups_allocated; i < size; i++) {
2728		new_groups[i] = kvzalloc(roundup_pow_of_two(
2729					 sizeof(struct flex_groups)),
2730					 GFP_KERNEL);
2731		if (!new_groups[i]) {
2732			for (j = sbi->s_flex_groups_allocated; j < i; j++)
2733				kvfree(new_groups[j]);
2734			kvfree(new_groups);
2735			ext4_msg(sb, KERN_ERR,
2736				 "not enough memory for %d flex groups", size);
2737			return -ENOMEM;
2738		}
2739	}
2740	rcu_read_lock();
2741	old_groups = rcu_dereference(sbi->s_flex_groups);
2742	if (old_groups)
2743		memcpy(new_groups, old_groups,
2744		       (sbi->s_flex_groups_allocated *
2745			sizeof(struct flex_groups *)));
2746	rcu_read_unlock();
2747	rcu_assign_pointer(sbi->s_flex_groups, new_groups);
2748	sbi->s_flex_groups_allocated = size;
2749	if (old_groups)
2750		ext4_kvfree_array_rcu(old_groups);
2751	return 0;
2752}
2753
2754static int ext4_fill_flex_info(struct super_block *sb)
2755{
2756	struct ext4_sb_info *sbi = EXT4_SB(sb);
2757	struct ext4_group_desc *gdp = NULL;
2758	struct flex_groups *fg;
2759	ext4_group_t flex_group;
2760	int i, err;
2761
2762	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
2763	if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
2764		sbi->s_log_groups_per_flex = 0;
2765		return 1;
2766	}
2767
2768	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
2769	if (err)
2770		goto failed;
2771
2772	for (i = 0; i < sbi->s_groups_count; i++) {
2773		gdp = ext4_get_group_desc(sb, i, NULL);
2774
2775		flex_group = ext4_flex_group(sbi, i);
2776		fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
2777		atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
2778		atomic64_add(ext4_free_group_clusters(sb, gdp),
2779			     &fg->free_clusters);
2780		atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
2781	}
2782
2783	return 1;
2784failed:
2785	return 0;
2786}
2787
2788static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
2789				   struct ext4_group_desc *gdp)
2790{
2791	int offset = offsetof(struct ext4_group_desc, bg_checksum);
2792	__u16 crc = 0;
2793	__le32 le_group = cpu_to_le32(block_group);
2794	struct ext4_sb_info *sbi = EXT4_SB(sb);
2795
2796	if (ext4_has_metadata_csum(sbi->s_sb)) {
2797		/* Use new metadata_csum algorithm */
2798		__u32 csum32;
2799		__u16 dummy_csum = 0;
2800
2801		csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
2802				     sizeof(le_group));
2803		csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
2804		csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
2805				     sizeof(dummy_csum));
2806		offset += sizeof(dummy_csum);
2807		if (offset < sbi->s_desc_size)
2808			csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
2809					     sbi->s_desc_size - offset);
2810
2811		crc = csum32 & 0xFFFF;
2812		goto out;
2813	}
2814
2815	/* old crc16 code */
2816	if (!ext4_has_feature_gdt_csum(sb))
2817		return 0;
2818
2819	crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
2820	crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
2821	crc = crc16(crc, (__u8 *)gdp, offset);
2822	offset += sizeof(gdp->bg_checksum); /* skip checksum */
2823	/* for checksum of struct ext4_group_desc do the rest...*/
2824	if (ext4_has_feature_64bit(sb) &&
2825	    offset < le16_to_cpu(sbi->s_es->s_desc_size))
2826		crc = crc16(crc, (__u8 *)gdp + offset,
2827			    le16_to_cpu(sbi->s_es->s_desc_size) -
2828				offset);
2829
2830out:
2831	return cpu_to_le16(crc);
2832}
2833
2834int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
2835				struct ext4_group_desc *gdp)
2836{
2837	if (ext4_has_group_desc_csum(sb) &&
2838	    (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
2839		return 0;
2840
2841	return 1;
2842}
2843
2844void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
2845			      struct ext4_group_desc *gdp)
2846{
2847	if (!ext4_has_group_desc_csum(sb))
2848		return;
2849	gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
2850}
2851
2852/* Called at mount-time, super-block is locked */
2853static int ext4_check_descriptors(struct super_block *sb,
2854				  ext4_fsblk_t sb_block,
2855				  ext4_group_t *first_not_zeroed)
2856{
2857	struct ext4_sb_info *sbi = EXT4_SB(sb);
2858	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
2859	ext4_fsblk_t last_block;
2860	ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
2861	ext4_fsblk_t block_bitmap;
2862	ext4_fsblk_t inode_bitmap;
2863	ext4_fsblk_t inode_table;
2864	int flexbg_flag = 0;
2865	ext4_group_t i, grp = sbi->s_groups_count;
2866
2867	if (ext4_has_feature_flex_bg(sb))
2868		flexbg_flag = 1;
2869
2870	ext4_debug("Checking group descriptors");
2871
2872	for (i = 0; i < sbi->s_groups_count; i++) {
2873		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
2874
2875		if (i == sbi->s_groups_count - 1 || flexbg_flag)
2876			last_block = ext4_blocks_count(sbi->s_es) - 1;
2877		else
2878			last_block = first_block +
2879				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
2880
2881		if ((grp == sbi->s_groups_count) &&
2882		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2883			grp = i;
2884
2885		block_bitmap = ext4_block_bitmap(sb, gdp);
2886		if (block_bitmap == sb_block) {
2887			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2888				 "Block bitmap for group %u overlaps "
2889				 "superblock", i);
2890			if (!sb_rdonly(sb))
2891				return 0;
2892		}
2893		if (block_bitmap >= sb_block + 1 &&
2894		    block_bitmap <= last_bg_block) {
2895			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2896				 "Block bitmap for group %u overlaps "
2897				 "block group descriptors", i);
2898			if (!sb_rdonly(sb))
2899				return 0;
2900		}
2901		if (block_bitmap < first_block || block_bitmap > last_block) {
2902			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2903			       "Block bitmap for group %u not in group "
2904			       "(block %llu)!", i, block_bitmap);
2905			return 0;
2906		}
2907		inode_bitmap = ext4_inode_bitmap(sb, gdp);
2908		if (inode_bitmap == sb_block) {
2909			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2910				 "Inode bitmap for group %u overlaps "
2911				 "superblock", i);
2912			if (!sb_rdonly(sb))
2913				return 0;
2914		}
2915		if (inode_bitmap >= sb_block + 1 &&
2916		    inode_bitmap <= last_bg_block) {
2917			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2918				 "Inode bitmap for group %u overlaps "
2919				 "block group descriptors", i);
2920			if (!sb_rdonly(sb))
2921				return 0;
2922		}
2923		if (inode_bitmap < first_block || inode_bitmap > last_block) {
2924			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2925			       "Inode bitmap for group %u not in group "
2926			       "(block %llu)!", i, inode_bitmap);
2927			return 0;
2928		}
2929		inode_table = ext4_inode_table(sb, gdp);
2930		if (inode_table == sb_block) {
2931			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2932				 "Inode table for group %u overlaps "
2933				 "superblock", i);
2934			if (!sb_rdonly(sb))
2935				return 0;
2936		}
2937		if (inode_table >= sb_block + 1 &&
2938		    inode_table <= last_bg_block) {
2939			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2940				 "Inode table for group %u overlaps "
2941				 "block group descriptors", i);
2942			if (!sb_rdonly(sb))
2943				return 0;
2944		}
2945		if (inode_table < first_block ||
2946		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
2947			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2948			       "Inode table for group %u not in group "
2949			       "(block %llu)!", i, inode_table);
2950			return 0;
2951		}
2952		ext4_lock_group(sb, i);
2953		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
2954			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2955				 "Checksum for group %u failed (%u!=%u)",
2956				 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
2957				     gdp)), le16_to_cpu(gdp->bg_checksum));
2958			if (!sb_rdonly(sb)) {
2959				ext4_unlock_group(sb, i);
2960				return 0;
2961			}
2962		}
2963		ext4_unlock_group(sb, i);
2964		if (!flexbg_flag)
2965			first_block += EXT4_BLOCKS_PER_GROUP(sb);
2966	}
2967	if (NULL != first_not_zeroed)
2968		*first_not_zeroed = grp;
2969	return 1;
2970}
2971
2972/*
2973 * Maximal extent format file size.
2974 * Resulting logical blkno at s_maxbytes must fit in our on-disk
2975 * extent format containers, within a sector_t, and within i_blocks
2976 * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
2977 * so that won't be a limiting factor.
2978 *
2979 * However there is other limiting factor. We do store extents in the form
2980 * of starting block and length, hence the resulting length of the extent
2981 * covering maximum file size must fit into on-disk format containers as
2982 * well. Given that length is always by 1 unit bigger than max unit (because
2983 * we count 0 as well) we have to lower the s_maxbytes by one fs block.
2984 *
2985 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
2986 */
2987static loff_t ext4_max_size(int blkbits, int has_huge_files)
2988{
2989	loff_t res;
2990	loff_t upper_limit = MAX_LFS_FILESIZE;
2991
2992	BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
2993
2994	if (!has_huge_files) {
2995		upper_limit = (1LL << 32) - 1;
2996
2997		/* total blocks in file system block size */
2998		upper_limit >>= (blkbits - 9);
2999		upper_limit <<= blkbits;
3000	}
3001
3002	/*
3003	 * 32-bit extent-start container, ee_block. We lower the maxbytes
3004	 * by one fs block, so ee_len can cover the extent of maximum file
3005	 * size
3006	 */
3007	res = (1LL << 32) - 1;
3008	res <<= blkbits;
3009
3010	/* Sanity check against vm- & vfs- imposed limits */
3011	if (res > upper_limit)
3012		res = upper_limit;
3013
3014	return res;
3015}
3016
3017/*
3018 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
3019 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
3020 * We need to be 1 filesystem block less than the 2^48 sector limit.
3021 */
3022static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
3023{
3024	loff_t res = EXT4_NDIR_BLOCKS;
3025	int meta_blocks;
3026	loff_t upper_limit;
3027	/* This is calculated to be the largest file size for a dense, block
3028	 * mapped file such that the file's total number of 512-byte sectors,
3029	 * including data and all indirect blocks, does not exceed (2^48 - 1).
3030	 *
3031	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
3032	 * number of 512-byte sectors of the file.
3033	 */
3034
3035	if (!has_huge_files) {
3036		/*
3037		 * !has_huge_files or implies that the inode i_block field
3038		 * represents total file blocks in 2^32 512-byte sectors ==
3039		 * size of vfs inode i_blocks * 8
3040		 */
3041		upper_limit = (1LL << 32) - 1;
3042
3043		/* total blocks in file system block size */
3044		upper_limit >>= (bits - 9);
3045
3046	} else {
3047		/*
3048		 * We use 48 bit ext4_inode i_blocks
3049		 * With EXT4_HUGE_FILE_FL set the i_blocks
3050		 * represent total number of blocks in
3051		 * file system block size
3052		 */
3053		upper_limit = (1LL << 48) - 1;
3054
3055	}
3056
3057	/* indirect blocks */
3058	meta_blocks = 1;
3059	/* double indirect blocks */
3060	meta_blocks += 1 + (1LL << (bits-2));
3061	/* tripple indirect blocks */
3062	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
3063
3064	upper_limit -= meta_blocks;
3065	upper_limit <<= bits;
3066
3067	res += 1LL << (bits-2);
3068	res += 1LL << (2*(bits-2));
3069	res += 1LL << (3*(bits-2));
3070	res <<= bits;
3071	if (res > upper_limit)
3072		res = upper_limit;
3073
3074	if (res > MAX_LFS_FILESIZE)
3075		res = MAX_LFS_FILESIZE;
3076
3077	return res;
3078}
3079
3080static ext4_fsblk_t descriptor_loc(struct super_block *sb,
3081				   ext4_fsblk_t logical_sb_block, int nr)
3082{
3083	struct ext4_sb_info *sbi = EXT4_SB(sb);
3084	ext4_group_t bg, first_meta_bg;
3085	int has_super = 0;
3086
3087	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
3088
3089	if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
3090		return logical_sb_block + nr + 1;
3091	bg = sbi->s_desc_per_block * nr;
3092	if (ext4_bg_has_super(sb, bg))
3093		has_super = 1;
3094
3095	/*
3096	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
3097	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
3098	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
3099	 * compensate.
3100	 */
3101	if (sb->s_blocksize == 1024 && nr == 0 &&
3102	    le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
3103		has_super++;
3104
3105	return (has_super + ext4_group_first_block_no(sb, bg));
3106}
3107
3108/**
3109 * ext4_get_stripe_size: Get the stripe size.
3110 * @sbi: In memory super block info
3111 *
3112 * If we have specified it via mount option, then
3113 * use the mount option value. If the value specified at mount time is
3114 * greater than the blocks per group use the super block value.
3115 * If the super block value is greater than blocks per group return 0.
3116 * Allocator needs it be less than blocks per group.
3117 *
3118 */
3119static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
3120{
3121	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
3122	unsigned long stripe_width =
3123			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
3124	int ret;
3125
3126	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
3127		ret = sbi->s_stripe;
3128	else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
3129		ret = stripe_width;
3130	else if (stride && stride <= sbi->s_blocks_per_group)
3131		ret = stride;
3132	else
3133		ret = 0;
3134
3135	/*
3136	 * If the stripe width is 1, this makes no sense and
3137	 * we set it to 0 to turn off stripe handling code.
3138	 */
3139	if (ret <= 1)
3140		ret = 0;
3141
3142	return ret;
3143}
3144
3145/*
3146 * Check whether this filesystem can be mounted based on
3147 * the features present and the RDONLY/RDWR mount requested.
3148 * Returns 1 if this filesystem can be mounted as requested,
3149 * 0 if it cannot be.
3150 */
3151int ext4_feature_set_ok(struct super_block *sb, int readonly)
3152{
3153	if (ext4_has_unknown_ext4_incompat_features(sb)) {
3154		ext4_msg(sb, KERN_ERR,
3155			"Couldn't mount because of "
3156			"unsupported optional features (%x)",
3157			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
3158			~EXT4_FEATURE_INCOMPAT_SUPP));
3159		return 0;
3160	}
3161
3162#ifndef CONFIG_UNICODE
3163	if (ext4_has_feature_casefold(sb)) {
3164		ext4_msg(sb, KERN_ERR,
3165			 "Filesystem with casefold feature cannot be "
3166			 "mounted without CONFIG_UNICODE");
3167		return 0;
3168	}
3169#endif
3170
3171	if (readonly)
3172		return 1;
3173
3174	if (ext4_has_feature_readonly(sb)) {
3175		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
3176		sb->s_flags |= SB_RDONLY;
3177		return 1;
3178	}
3179
3180	/* Check that feature set is OK for a read-write mount */
3181	if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
3182		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
3183			 "unsupported optional features (%x)",
3184			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
3185				~EXT4_FEATURE_RO_COMPAT_SUPP));
3186		return 0;
3187	}
3188	if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
3189		ext4_msg(sb, KERN_ERR,
3190			 "Can't support bigalloc feature without "
3191			 "extents feature\n");
3192		return 0;
3193	}
3194
3195#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
3196	if (!readonly && (ext4_has_feature_quota(sb) ||
3197			  ext4_has_feature_project(sb))) {
3198		ext4_msg(sb, KERN_ERR,
3199			 "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
3200		return 0;
3201	}
3202#endif  /* CONFIG_QUOTA */
3203	return 1;
3204}
3205
3206/*
3207 * This function is called once a day if we have errors logged
3208 * on the file system
3209 */
3210static void print_daily_error_info(struct timer_list *t)
3211{
3212	struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
3213	struct super_block *sb = sbi->s_sb;
3214	struct ext4_super_block *es = sbi->s_es;
3215
3216	if (es->s_error_count)
3217		/* fsck newer than v1.41.13 is needed to clean this condition. */
3218		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
3219			 le32_to_cpu(es->s_error_count));
3220	if (es->s_first_error_time) {
3221		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
3222		       sb->s_id,
3223		       ext4_get_tstamp(es, s_first_error_time),
3224		       (int) sizeof(es->s_first_error_func),
3225		       es->s_first_error_func,
3226		       le32_to_cpu(es->s_first_error_line));
3227		if (es->s_first_error_ino)
3228			printk(KERN_CONT ": inode %u",
3229			       le32_to_cpu(es->s_first_error_ino));
3230		if (es->s_first_error_block)
3231			printk(KERN_CONT ": block %llu", (unsigned long long)
3232			       le64_to_cpu(es->s_first_error_block));
3233		printk(KERN_CONT "\n");
3234	}
3235	if (es->s_last_error_time) {
3236		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
3237		       sb->s_id,
3238		       ext4_get_tstamp(es, s_last_error_time),
3239		       (int) sizeof(es->s_last_error_func),
3240		       es->s_last_error_func,
3241		       le32_to_cpu(es->s_last_error_line));
3242		if (es->s_last_error_ino)
3243			printk(KERN_CONT ": inode %u",
3244			       le32_to_cpu(es->s_last_error_ino));
3245		if (es->s_last_error_block)
3246			printk(KERN_CONT ": block %llu", (unsigned long long)
3247			       le64_to_cpu(es->s_last_error_block));
3248		printk(KERN_CONT "\n");
3249	}
3250	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
3251}
3252
3253/* Find next suitable group and run ext4_init_inode_table */
3254static int ext4_run_li_request(struct ext4_li_request *elr)
3255{
3256	struct ext4_group_desc *gdp = NULL;
3257	struct super_block *sb = elr->lr_super;
3258	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
3259	ext4_group_t group = elr->lr_next_group;
3260	unsigned long timeout = 0;
3261	unsigned int prefetch_ios = 0;
3262	int ret = 0;
3263
3264	if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
3265		elr->lr_next_group = ext4_mb_prefetch(sb, group,
3266				EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
3267		if (prefetch_ios)
3268			ext4_mb_prefetch_fini(sb, elr->lr_next_group,
3269					      prefetch_ios);
3270		trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
3271					    prefetch_ios);
3272		if (group >= elr->lr_next_group) {
3273			ret = 1;
3274			if (elr->lr_first_not_zeroed != ngroups &&
3275			    !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
3276				elr->lr_next_group = elr->lr_first_not_zeroed;
3277				elr->lr_mode = EXT4_LI_MODE_ITABLE;
3278				ret = 0;
3279			}
3280		}
3281		return ret;
3282	}
3283
3284	for (; group < ngroups; group++) {
3285		gdp = ext4_get_group_desc(sb, group, NULL);
3286		if (!gdp) {
3287			ret = 1;
3288			break;
3289		}
3290
3291		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3292			break;
3293	}
3294
3295	if (group >= ngroups)
3296		ret = 1;
3297
3298	if (!ret) {
3299		timeout = jiffies;
3300		ret = ext4_init_inode_table(sb, group,
3301					    elr->lr_timeout ? 0 : 1);
3302		trace_ext4_lazy_itable_init(sb, group);
3303		if (elr->lr_timeout == 0) {
3304			timeout = (jiffies - timeout) *
3305				EXT4_SB(elr->lr_super)->s_li_wait_mult;
3306			elr->lr_timeout = timeout;
3307		}
3308		elr->lr_next_sched = jiffies + elr->lr_timeout;
3309		elr->lr_next_group = group + 1;
3310	}
3311	return ret;
3312}
3313
3314/*
3315 * Remove lr_request from the list_request and free the
3316 * request structure. Should be called with li_list_mtx held
3317 */
3318static void ext4_remove_li_request(struct ext4_li_request *elr)
3319{
3320	if (!elr)
3321		return;
3322
3323	list_del(&elr->lr_request);
3324	EXT4_SB(elr->lr_super)->s_li_request = NULL;
3325	kfree(elr);
3326}
3327
3328static void ext4_unregister_li_request(struct super_block *sb)
3329{
3330	mutex_lock(&ext4_li_mtx);
3331	if (!ext4_li_info) {
3332		mutex_unlock(&ext4_li_mtx);
3333		return;
3334	}
3335
3336	mutex_lock(&ext4_li_info->li_list_mtx);
3337	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
3338	mutex_unlock(&ext4_li_info->li_list_mtx);
3339	mutex_unlock(&ext4_li_mtx);
3340}
3341
3342static struct task_struct *ext4_lazyinit_task;
3343
3344/*
3345 * This is the function where ext4lazyinit thread lives. It walks
3346 * through the request list searching for next scheduled filesystem.
3347 * When such a fs is found, run the lazy initialization request
3348 * (ext4_rn_li_request) and keep track of the time spend in this
3349 * function. Based on that time we compute next schedule time of
3350 * the request. When walking through the list is complete, compute
3351 * next waking time and put itself into sleep.
3352 */
3353static int ext4_lazyinit_thread(void *arg)
3354{
3355	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
3356	struct list_head *pos, *n;
3357	struct ext4_li_request *elr;
3358	unsigned long next_wakeup, cur;
3359
3360	BUG_ON(NULL == eli);
3361
3362cont_thread:
3363	while (true) {
3364		next_wakeup = MAX_JIFFY_OFFSET;
3365
3366		mutex_lock(&eli->li_list_mtx);
3367		if (list_empty(&eli->li_request_list)) {
3368			mutex_unlock(&eli->li_list_mtx);
3369			goto exit_thread;
3370		}
3371		list_for_each_safe(pos, n, &eli->li_request_list) {
3372			int err = 0;
3373			int progress = 0;
3374			elr = list_entry(pos, struct ext4_li_request,
3375					 lr_request);
3376
3377			if (time_before(jiffies, elr->lr_next_sched)) {
3378				if (time_before(elr->lr_next_sched, next_wakeup))
3379					next_wakeup = elr->lr_next_sched;
3380				continue;
3381			}
3382			if (down_read_trylock(&elr->lr_super->s_umount)) {
3383				if (sb_start_write_trylock(elr->lr_super)) {
3384					progress = 1;
3385					/*
3386					 * We hold sb->s_umount, sb can not
3387					 * be removed from the list, it is
3388					 * now safe to drop li_list_mtx
3389					 */
3390					mutex_unlock(&eli->li_list_mtx);
3391					err = ext4_run_li_request(elr);
3392					sb_end_write(elr->lr_super);
3393					mutex_lock(&eli->li_list_mtx);
3394					n = pos->next;
3395				}
3396				up_read((&elr->lr_super->s_umount));
3397			}
3398			/* error, remove the lazy_init job */
3399			if (err) {
3400				ext4_remove_li_request(elr);
3401				continue;
3402			}
3403			if (!progress) {
3404				elr->lr_next_sched = jiffies +
3405					(prandom_u32()
3406					 % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
3407			}
3408			if (time_before(elr->lr_next_sched, next_wakeup))
3409				next_wakeup = elr->lr_next_sched;
3410		}
3411		mutex_unlock(&eli->li_list_mtx);
3412
3413		try_to_freeze();
3414
3415		cur = jiffies;
3416		if ((time_after_eq(cur, next_wakeup)) ||
3417		    (MAX_JIFFY_OFFSET == next_wakeup)) {
3418			cond_resched();
3419			continue;
3420		}
3421
3422		schedule_timeout_interruptible(next_wakeup - cur);
3423
3424		if (kthread_should_stop()) {
3425			ext4_clear_request_list();
3426			goto exit_thread;
3427		}
3428	}
3429
3430exit_thread:
3431	/*
3432	 * It looks like the request list is empty, but we need
3433	 * to check it under the li_list_mtx lock, to prevent any
3434	 * additions into it, and of course we should lock ext4_li_mtx
3435	 * to atomically free the list and ext4_li_info, because at
3436	 * this point another ext4 filesystem could be registering
3437	 * new one.
3438	 */
3439	mutex_lock(&ext4_li_mtx);
3440	mutex_lock(&eli->li_list_mtx);
3441	if (!list_empty(&eli->li_request_list)) {
3442		mutex_unlock(&eli->li_list_mtx);
3443		mutex_unlock(&ext4_li_mtx);
3444		goto cont_thread;
3445	}
3446	mutex_unlock(&eli->li_list_mtx);
3447	kfree(ext4_li_info);
3448	ext4_li_info = NULL;
3449	mutex_unlock(&ext4_li_mtx);
3450
3451	return 0;
3452}
3453
3454static void ext4_clear_request_list(void)
3455{
3456	struct list_head *pos, *n;
3457	struct ext4_li_request *elr;
3458
3459	mutex_lock(&ext4_li_info->li_list_mtx);
3460	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
3461		elr = list_entry(pos, struct ext4_li_request,
3462				 lr_request);
3463		ext4_remove_li_request(elr);
3464	}
3465	mutex_unlock(&ext4_li_info->li_list_mtx);
3466}
3467
3468static int ext4_run_lazyinit_thread(void)
3469{
3470	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
3471					 ext4_li_info, "ext4lazyinit");
3472	if (IS_ERR(ext4_lazyinit_task)) {
3473		int err = PTR_ERR(ext4_lazyinit_task);
3474		ext4_clear_request_list();
3475		kfree(ext4_li_info);
3476		ext4_li_info = NULL;
3477		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
3478				 "initialization thread\n",
3479				 err);
3480		return err;
3481	}
3482	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
3483	return 0;
3484}
3485
3486/*
3487 * Check whether it make sense to run itable init. thread or not.
3488 * If there is at least one uninitialized inode table, return
3489 * corresponding group number, else the loop goes through all
3490 * groups and return total number of groups.
3491 */
3492static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
3493{
3494	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
3495	struct ext4_group_desc *gdp = NULL;
3496
3497	if (!ext4_has_group_desc_csum(sb))
3498		return ngroups;
3499
3500	for (group = 0; group < ngroups; group++) {
3501		gdp = ext4_get_group_desc(sb, group, NULL);
3502		if (!gdp)
3503			continue;
3504
3505		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3506			break;
3507	}
3508
3509	return group;
3510}
3511
3512static int ext4_li_info_new(void)
3513{
3514	struct ext4_lazy_init *eli = NULL;
3515
3516	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
3517	if (!eli)
3518		return -ENOMEM;
3519
3520	INIT_LIST_HEAD(&eli->li_request_list);
3521	mutex_init(&eli->li_list_mtx);
3522
3523	eli->li_state |= EXT4_LAZYINIT_QUIT;
3524
3525	ext4_li_info = eli;
3526
3527	return 0;
3528}
3529
3530static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
3531					    ext4_group_t start)
3532{
3533	struct ext4_li_request *elr;
3534
3535	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
3536	if (!elr)
3537		return NULL;
3538
3539	elr->lr_super = sb;
3540	elr->lr_first_not_zeroed = start;
3541	if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
3542		elr->lr_mode = EXT4_LI_MODE_ITABLE;
3543		elr->lr_next_group = start;
3544	} else {
3545		elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
3546	}
3547
3548	/*
3549	 * Randomize first schedule time of the request to
3550	 * spread the inode table initialization requests
3551	 * better.
3552	 */
3553	elr->lr_next_sched = jiffies + (prandom_u32() %
3554				(EXT4_DEF_LI_MAX_START_DELAY * HZ));
3555	return elr;
3556}
3557
3558int ext4_register_li_request(struct super_block *sb,
3559			     ext4_group_t first_not_zeroed)
3560{
3561	struct ext4_sb_info *sbi = EXT4_SB(sb);
3562	struct ext4_li_request *elr = NULL;
3563	ext4_group_t ngroups = sbi->s_groups_count;
3564	int ret = 0;
3565
3566	mutex_lock(&ext4_li_mtx);
3567	if (sbi->s_li_request != NULL) {
3568		/*
3569		 * Reset timeout so it can be computed again, because
3570		 * s_li_wait_mult might have changed.
3571		 */
3572		sbi->s_li_request->lr_timeout = 0;
3573		goto out;
3574	}
3575
3576	if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
3577	    (first_not_zeroed == ngroups || sb_rdonly(sb) ||
3578	     !test_opt(sb, INIT_INODE_TABLE)))
3579		goto out;
3580
3581	elr = ext4_li_request_new(sb, first_not_zeroed);
3582	if (!elr) {
3583		ret = -ENOMEM;
3584		goto out;
3585	}
3586
3587	if (NULL == ext4_li_info) {
3588		ret = ext4_li_info_new();
3589		if (ret)
3590			goto out;
3591	}
3592
3593	mutex_lock(&ext4_li_info->li_list_mtx);
3594	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
3595	mutex_unlock(&ext4_li_info->li_list_mtx);
3596
3597	sbi->s_li_request = elr;
3598	/*
3599	 * set elr to NULL here since it has been inserted to
3600	 * the request_list and the removal and free of it is
3601	 * handled by ext4_clear_request_list from now on.
3602	 */
3603	elr = NULL;
3604
3605	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
3606		ret = ext4_run_lazyinit_thread();
3607		if (ret)
3608			goto out;
3609	}
3610out:
3611	mutex_unlock(&ext4_li_mtx);
3612	if (ret)
3613		kfree(elr);
3614	return ret;
3615}
3616
3617/*
3618 * We do not need to lock anything since this is called on
3619 * module unload.
3620 */
3621static void ext4_destroy_lazyinit_thread(void)
3622{
3623	/*
3624	 * If thread exited earlier
3625	 * there's nothing to be done.
3626	 */
3627	if (!ext4_li_info || !ext4_lazyinit_task)
3628		return;
3629
3630	kthread_stop(ext4_lazyinit_task);
3631}
3632
3633static int set_journal_csum_feature_set(struct super_block *sb)
3634{
3635	int ret = 1;
3636	int compat, incompat;
3637	struct ext4_sb_info *sbi = EXT4_SB(sb);
3638
3639	if (ext4_has_metadata_csum(sb)) {
3640		/* journal checksum v3 */
3641		compat = 0;
3642		incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
3643	} else {
3644		/* journal checksum v1 */
3645		compat = JBD2_FEATURE_COMPAT_CHECKSUM;
3646		incompat = 0;
3647	}
3648
3649	jbd2_journal_clear_features(sbi->s_journal,
3650			JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3651			JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3652			JBD2_FEATURE_INCOMPAT_CSUM_V2);
3653	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
3654		ret = jbd2_journal_set_features(sbi->s_journal,
3655				compat, 0,
3656				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3657				incompat);
3658	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
3659		ret = jbd2_journal_set_features(sbi->s_journal,
3660				compat, 0,
3661				incompat);
3662		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3663				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3664	} else {
3665		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3666				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3667	}
3668
3669	return ret;
3670}
3671
3672/*
3673 * Note: calculating the overhead so we can be compatible with
3674 * historical BSD practice is quite difficult in the face of
3675 * clusters/bigalloc.  This is because multiple metadata blocks from
3676 * different block group can end up in the same allocation cluster.
3677 * Calculating the exact overhead in the face of clustered allocation
3678 * requires either O(all block bitmaps) in memory or O(number of block
3679 * groups**2) in time.  We will still calculate the superblock for
3680 * older file systems --- and if we come across with a bigalloc file
3681 * system with zero in s_overhead_clusters the estimate will be close to
3682 * correct especially for very large cluster sizes --- but for newer
3683 * file systems, it's better to calculate this figure once at mkfs
3684 * time, and store it in the superblock.  If the superblock value is
3685 * present (even for non-bigalloc file systems), we will use it.
3686 */
3687static int count_overhead(struct super_block *sb, ext4_group_t grp,
3688			  char *buf)
3689{
3690	struct ext4_sb_info	*sbi = EXT4_SB(sb);
3691	struct ext4_group_desc	*gdp;
3692	ext4_fsblk_t		first_block, last_block, b;
3693	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
3694	int			s, j, count = 0;
3695
3696	if (!ext4_has_feature_bigalloc(sb))
3697		return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
3698			sbi->s_itb_per_group + 2);
3699
3700	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
3701		(grp * EXT4_BLOCKS_PER_GROUP(sb));
3702	last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
3703	for (i = 0; i < ngroups; i++) {
3704		gdp = ext4_get_group_desc(sb, i, NULL);
3705		b = ext4_block_bitmap(sb, gdp);
3706		if (b >= first_block && b <= last_block) {
3707			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3708			count++;
3709		}
3710		b = ext4_inode_bitmap(sb, gdp);
3711		if (b >= first_block && b <= last_block) {
3712			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3713			count++;
3714		}
3715		b = ext4_inode_table(sb, gdp);
3716		if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
3717			for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
3718				int c = EXT4_B2C(sbi, b - first_block);
3719				ext4_set_bit(c, buf);
3720				count++;
3721			}
3722		if (i != grp)
3723			continue;
3724		s = 0;
3725		if (ext4_bg_has_super(sb, grp)) {
3726			ext4_set_bit(s++, buf);
3727			count++;
3728		}
3729		j = ext4_bg_num_gdb(sb, grp);
3730		if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
3731			ext4_error(sb, "Invalid number of block group "
3732				   "descriptor blocks: %d", j);
3733			j = EXT4_BLOCKS_PER_GROUP(sb) - s;
3734		}
3735		count += j;
3736		for (; j > 0; j--)
3737			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
3738	}
3739	if (!count)
3740		return 0;
3741	return EXT4_CLUSTERS_PER_GROUP(sb) -
3742		ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
3743}
3744
3745/*
3746 * Compute the overhead and stash it in sbi->s_overhead
3747 */
3748int ext4_calculate_overhead(struct super_block *sb)
3749{
3750	struct ext4_sb_info *sbi = EXT4_SB(sb);
3751	struct ext4_super_block *es = sbi->s_es;
3752	struct inode *j_inode;
3753	unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
3754	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3755	ext4_fsblk_t overhead = 0;
3756	char *buf = (char *) get_zeroed_page(GFP_NOFS);
3757
3758	if (!buf)
3759		return -ENOMEM;
3760
3761	/*
3762	 * Compute the overhead (FS structures).  This is constant
3763	 * for a given filesystem unless the number of block groups
3764	 * changes so we cache the previous value until it does.
3765	 */
3766
3767	/*
3768	 * All of the blocks before first_data_block are overhead
3769	 */
3770	overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
3771
3772	/*
3773	 * Add the overhead found in each block group
3774	 */
3775	for (i = 0; i < ngroups; i++) {
3776		int blks;
3777
3778		blks = count_overhead(sb, i, buf);
3779		overhead += blks;
3780		if (blks)
3781			memset(buf, 0, PAGE_SIZE);
3782		cond_resched();
3783	}
3784
3785	/*
3786	 * Add the internal journal blocks whether the journal has been
3787	 * loaded or not
3788	 */
3789	if (sbi->s_journal && !sbi->s_journal_bdev)
3790		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
3791	else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
3792		/* j_inum for internal journal is non-zero */
3793		j_inode = ext4_get_journal_inode(sb, j_inum);
3794		if (j_inode) {
3795			j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
3796			overhead += EXT4_NUM_B2C(sbi, j_blocks);
3797			iput(j_inode);
3798		} else {
3799			ext4_msg(sb, KERN_ERR, "can't get journal size");
3800		}
3801	}
3802	sbi->s_overhead = overhead;
3803	smp_wmb();
3804	free_page((unsigned long) buf);
3805	return 0;
3806}
3807
3808static void ext4_set_resv_clusters(struct super_block *sb)
3809{
3810	ext4_fsblk_t resv_clusters;
3811	struct ext4_sb_info *sbi = EXT4_SB(sb);
3812
3813	/*
3814	 * There's no need to reserve anything when we aren't using extents.
3815	 * The space estimates are exact, there are no unwritten extents,
3816	 * hole punching doesn't need new metadata... This is needed especially
3817	 * to keep ext2/3 backward compatibility.
3818	 */
3819	if (!ext4_has_feature_extents(sb))
3820		return;
3821	/*
3822	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
3823	 * This should cover the situations where we can not afford to run
3824	 * out of space like for example punch hole, or converting
3825	 * unwritten extents in delalloc path. In most cases such
3826	 * allocation would require 1, or 2 blocks, higher numbers are
3827	 * very rare.
3828	 */
3829	resv_clusters = (ext4_blocks_count(sbi->s_es) >>
3830			 sbi->s_cluster_bits);
3831
3832	do_div(resv_clusters, 50);
3833	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
3834
3835	atomic64_set(&sbi->s_resv_clusters, resv_clusters);
3836}
3837
/*
 * Return a string naming the quota mode of @sb: "none", "journalled" or
 * "writeback" with CONFIG_QUOTA, "disabled" without it.
 */
static const char *ext4_quota_mode(struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	if (!ext4_quota_capable(sb))
		return "none";

	return (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) ?
		"journalled" : "writeback";
#else
	return "disabled";
#endif
}
3852
3853static void ext4_setup_csum_trigger(struct super_block *sb,
3854				    enum ext4_journal_trigger_type type,
3855				    void (*trigger)(
3856					struct jbd2_buffer_trigger_type *type,
3857					struct buffer_head *bh,
3858					void *mapped_data,
3859					size_t size))
3860{
3861	struct ext4_sb_info *sbi = EXT4_SB(sb);
3862
3863	sbi->s_journal_triggers[type].sb = sb;
3864	sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
3865}
3866
3867static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3868{
3869	struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
3870	char *orig_data = kstrdup(data, GFP_KERNEL);
3871	struct buffer_head *bh, **group_desc;
3872	struct ext4_super_block *es = NULL;
3873	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
3874	struct flex_groups **flex_groups;
3875	ext4_fsblk_t block;
3876	ext4_fsblk_t sb_block = get_sb_block(&data);
3877	ext4_fsblk_t logical_sb_block;
3878	unsigned long offset = 0;
3879	unsigned long def_mount_opts;
3880	struct inode *root;
3881	const char *descr;
3882	int ret = -ENOMEM;
3883	int blocksize, clustersize;
3884	unsigned int db_count;
3885	unsigned int i;
3886	int needs_recovery, has_huge_files;
3887	__u64 blocks_count;
3888	int err = 0;
3889	ext4_group_t first_not_zeroed;
3890	struct ext4_parsed_options parsed_opts;
3891
3892	/* Set defaults for the variables that will be set during parsing */
3893	parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3894	parsed_opts.journal_devnum = 0;
3895	parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
3896
3897	if ((data && !orig_data) || !sbi)
3898		goto out_free_base;
3899
3900	sbi->s_daxdev = dax_dev;
3901	sbi->s_blockgroup_lock =
3902		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
3903	if (!sbi->s_blockgroup_lock)
3904		goto out_free_base;
3905
3906	sb->s_fs_info = sbi;
3907	sbi->s_sb = sb;
3908	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
3909	sbi->s_sb_block = sb_block;
3910	sbi->s_sectors_written_start =
3911		part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
3912
3913	/* Cleanup superblock name */
3914	strreplace(sb->s_id, '/', '!');
3915
3916	/* -EINVAL is default */
3917	ret = -EINVAL;
3918	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
3919	if (!blocksize) {
3920		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
3921		goto out_fail;
3922	}
3923
3924	/*
3925	 * The ext4 superblock will not be buffer aligned for other than 1kB
3926	 * block sizes.  We need to calculate the offset from buffer start.
3927	 */
3928	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
3929		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3930		offset = do_div(logical_sb_block, blocksize);
3931	} else {
3932		logical_sb_block = sb_block;
3933	}
3934
3935	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
3936	if (IS_ERR(bh)) {
3937		ext4_msg(sb, KERN_ERR, "unable to read superblock");
3938		ret = PTR_ERR(bh);
3939		goto out_fail;
3940	}
3941	/*
3942	 * Note: s_es must be initialized as soon as possible because
3943	 *       some ext4 macro-instructions depend on its value
3944	 */
3945	es = (struct ext4_super_block *) (bh->b_data + offset);
3946	sbi->s_es = es;
3947	sb->s_magic = le16_to_cpu(es->s_magic);
3948	if (sb->s_magic != EXT4_SUPER_MAGIC)
3949		goto cantfind_ext4;
3950	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
3951
3952	/* Warn if metadata_csum and gdt_csum are both set. */
3953	if (ext4_has_feature_metadata_csum(sb) &&
3954	    ext4_has_feature_gdt_csum(sb))
3955		ext4_warning(sb, "metadata_csum and uninit_bg are "
3956			     "redundant flags; please run fsck.");
3957
3958	/* Check for a known checksum algorithm */
3959	if (!ext4_verify_csum_type(sb, es)) {
3960		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3961			 "unknown checksum algorithm.");
3962		silent = 1;
3963		goto cantfind_ext4;
3964	}
3965	ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
3966				ext4_orphan_file_block_trigger);
3967
3968	/* Load the checksum driver */
3969	sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
3970	if (IS_ERR(sbi->s_chksum_driver)) {
3971		ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
3972		ret = PTR_ERR(sbi->s_chksum_driver);
3973		sbi->s_chksum_driver = NULL;
3974		goto failed_mount;
3975	}
3976
3977	/* Check superblock checksum */
3978	if (!ext4_superblock_csum_verify(sb, es)) {
3979		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3980			 "invalid superblock checksum.  Run e2fsck?");
3981		silent = 1;
3982		ret = -EFSBADCRC;
3983		goto cantfind_ext4;
3984	}
3985
3986	/* Precompute checksum seed for all metadata */
3987	if (ext4_has_feature_csum_seed(sb))
3988		sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
3989	else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
3990		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
3991					       sizeof(es->s_uuid));
3992
3993	/* Set defaults before we parse the mount options */
3994	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3995	set_opt(sb, INIT_INODE_TABLE);
3996	if (def_mount_opts & EXT4_DEFM_DEBUG)
3997		set_opt(sb, DEBUG);
3998	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
3999		set_opt(sb, GRPID);
4000	if (def_mount_opts & EXT4_DEFM_UID16)
4001		set_opt(sb, NO_UID32);
4002	/* xattr user namespace & acls are now defaulted on */
4003	set_opt(sb, XATTR_USER);
4004#ifdef CONFIG_EXT4_FS_POSIX_ACL
4005	set_opt(sb, POSIX_ACL);
4006#endif
4007	if (ext4_has_feature_fast_commit(sb))
4008		set_opt2(sb, JOURNAL_FAST_COMMIT);
4009	/* don't forget to enable journal_csum when metadata_csum is enabled. */
4010	if (ext4_has_metadata_csum(sb))
4011		set_opt(sb, JOURNAL_CHECKSUM);
4012
4013	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
4014		set_opt(sb, JOURNAL_DATA);
4015	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
4016		set_opt(sb, ORDERED_DATA);
4017	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
4018		set_opt(sb, WRITEBACK_DATA);
4019
4020	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
4021		set_opt(sb, ERRORS_PANIC);
4022	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
4023		set_opt(sb, ERRORS_CONT);
4024	else
4025		set_opt(sb, ERRORS_RO);
4026	/* block_validity enabled by default; disable with noblock_validity */
4027	set_opt(sb, BLOCK_VALIDITY);
4028	if (def_mount_opts & EXT4_DEFM_DISCARD)
4029		set_opt(sb, DISCARD);
4030
4031	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
4032	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
4033	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
4034	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
4035	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
4036
4037	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
4038		set_opt(sb, BARRIER);
4039
4040	/*
4041	 * enable delayed allocation by default
4042	 * Use -o nodelalloc to turn it off
4043	 */
4044	if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
4045	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
4046		set_opt(sb, DELALLOC);
4047
4048	/*
4049	 * set default s_li_wait_mult for lazyinit, for the case there is
4050	 * no mount option specified.
4051	 */
4052	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
4053
4054	if (le32_to_cpu(es->s_log_block_size) >
4055	    (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
4056		ext4_msg(sb, KERN_ERR,
4057			 "Invalid log block size: %u",
4058			 le32_to_cpu(es->s_log_block_size));
4059		goto failed_mount;
4060	}
4061	if (le32_to_cpu(es->s_log_cluster_size) >
4062	    (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
4063		ext4_msg(sb, KERN_ERR,
4064			 "Invalid log cluster size: %u",
4065			 le32_to_cpu(es->s_log_cluster_size));
4066		goto failed_mount;
4067	}
4068
4069	blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
4070
4071	if (blocksize == PAGE_SIZE)
4072		set_opt(sb, DIOREAD_NOLOCK);
4073
4074	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
4075		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
4076		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
4077	} else {
4078		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
4079		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
4080		if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
4081			ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
4082				 sbi->s_first_ino);
4083			goto failed_mount;
4084		}
4085		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
4086		    (!is_power_of_2(sbi->s_inode_size)) ||
4087		    (sbi->s_inode_size > blocksize)) {
4088			ext4_msg(sb, KERN_ERR,
4089			       "unsupported inode size: %d",
4090			       sbi->s_inode_size);
4091			ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
4092			goto failed_mount;
4093		}
4094		/*
4095		 * i_atime_extra is the last extra field available for
4096		 * [acm]times in struct ext4_inode. Checking for that
4097		 * field should suffice to ensure we have extra space
4098		 * for all three.
4099		 */
4100		if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
4101			sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
4102			sb->s_time_gran = 1;
4103			sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
4104		} else {
4105			sb->s_time_gran = NSEC_PER_SEC;
4106			sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
4107		}
4108		sb->s_time_min = EXT4_TIMESTAMP_MIN;
4109	}
4110	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
4111		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
4112			EXT4_GOOD_OLD_INODE_SIZE;
4113		if (ext4_has_feature_extra_isize(sb)) {
4114			unsigned v, max = (sbi->s_inode_size -
4115					   EXT4_GOOD_OLD_INODE_SIZE);
4116
4117			v = le16_to_cpu(es->s_want_extra_isize);
4118			if (v > max) {
4119				ext4_msg(sb, KERN_ERR,
4120					 "bad s_want_extra_isize: %d", v);
4121				goto failed_mount;
4122			}
4123			if (sbi->s_want_extra_isize < v)
4124				sbi->s_want_extra_isize = v;
4125
4126			v = le16_to_cpu(es->s_min_extra_isize);
4127			if (v > max) {
4128				ext4_msg(sb, KERN_ERR,
4129					 "bad s_min_extra_isize: %d", v);
4130				goto failed_mount;
4131			}
4132			if (sbi->s_want_extra_isize < v)
4133				sbi->s_want_extra_isize = v;
4134		}
4135	}
4136
4137	if (sbi->s_es->s_mount_opts[0]) {
4138		char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
4139					      sizeof(sbi->s_es->s_mount_opts),
4140					      GFP_KERNEL);
4141		if (!s_mount_opts)
4142			goto failed_mount;
4143		if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) {
4144			ext4_msg(sb, KERN_WARNING,
4145				 "failed to parse options in superblock: %s",
4146				 s_mount_opts);
4147		}
4148		kfree(s_mount_opts);
4149	}
4150	sbi->s_def_mount_opt = sbi->s_mount_opt;
4151	if (!parse_options((char *) data, sb, &parsed_opts, 0))
4152		goto failed_mount;
4153
4154#ifdef CONFIG_UNICODE
4155	if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
4156		const struct ext4_sb_encodings *encoding_info;
4157		struct unicode_map *encoding;
4158		__u16 encoding_flags;
4159
4160		if (ext4_sb_read_encoding(es, &encoding_info,
4161					  &encoding_flags)) {
4162			ext4_msg(sb, KERN_ERR,
4163				 "Encoding requested by superblock is unknown");
4164			goto failed_mount;
4165		}
4166
4167		encoding = utf8_load(encoding_info->version);
4168		if (IS_ERR(encoding)) {
4169			ext4_msg(sb, KERN_ERR,
4170				 "can't mount with superblock charset: %s-%s "
4171				 "not supported by the kernel. flags: 0x%x.",
4172				 encoding_info->name, encoding_info->version,
4173				 encoding_flags);
4174			goto failed_mount;
4175		}
4176		ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
4177			 "%s-%s with flags 0x%hx", encoding_info->name,
4178			 encoding_info->version?:"\b", encoding_flags);
4179
4180		sb->s_encoding = encoding;
4181		sb->s_encoding_flags = encoding_flags;
4182	}
4183#endif
4184
4185	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4186		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
4187		/* can't mount with both data=journal and dioread_nolock. */
4188		clear_opt(sb, DIOREAD_NOLOCK);
4189		clear_opt2(sb, JOURNAL_FAST_COMMIT);
4190		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4191			ext4_msg(sb, KERN_ERR, "can't mount with "
4192				 "both data=journal and delalloc");
4193			goto failed_mount;
4194		}
4195		if (test_opt(sb, DAX_ALWAYS)) {
4196			ext4_msg(sb, KERN_ERR, "can't mount with "
4197				 "both data=journal and dax");
4198			goto failed_mount;
4199		}
4200		if (ext4_has_feature_encrypt(sb)) {
4201			ext4_msg(sb, KERN_WARNING,
4202				 "encrypted files will use data=ordered "
4203				 "instead of data journaling mode");
4204		}
4205		if (test_opt(sb, DELALLOC))
4206			clear_opt(sb, DELALLOC);
4207	} else {
4208		sb->s_iflags |= SB_I_CGROUPWB;
4209	}
4210
4211	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
4212		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
4213
4214	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
4215	    (ext4_has_compat_features(sb) ||
4216	     ext4_has_ro_compat_features(sb) ||
4217	     ext4_has_incompat_features(sb)))
4218		ext4_msg(sb, KERN_WARNING,
4219		       "feature flags set on rev 0 fs, "
4220		       "running e2fsck is recommended");
4221
4222	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
4223		set_opt2(sb, HURD_COMPAT);
4224		if (ext4_has_feature_64bit(sb)) {
4225			ext4_msg(sb, KERN_ERR,
4226				 "The Hurd can't support 64-bit file systems");
4227			goto failed_mount;
4228		}
4229
4230		/*
4231		 * ea_inode feature uses l_i_version field which is not
4232		 * available in HURD_COMPAT mode.
4233		 */
4234		if (ext4_has_feature_ea_inode(sb)) {
4235			ext4_msg(sb, KERN_ERR,
4236				 "ea_inode feature is not supported for Hurd");
4237			goto failed_mount;
4238		}
4239	}
4240
4241	if (IS_EXT2_SB(sb)) {
4242		if (ext2_feature_set_ok(sb))
4243			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
4244				 "using the ext4 subsystem");
4245		else {
4246			/*
4247			 * If we're probing be silent, if this looks like
4248			 * it's actually an ext[34] filesystem.
4249			 */
4250			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4251				goto failed_mount;
4252			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
4253				 "to feature incompatibilities");
4254			goto failed_mount;
4255		}
4256	}
4257
4258	if (IS_EXT3_SB(sb)) {
4259		if (ext3_feature_set_ok(sb))
4260			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
4261				 "using the ext4 subsystem");
4262		else {
4263			/*
4264			 * If we're probing be silent, if this looks like
4265			 * it's actually an ext4 filesystem.
4266			 */
4267			if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4268				goto failed_mount;
4269			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
4270				 "to feature incompatibilities");
4271			goto failed_mount;
4272		}
4273	}
4274
4275	/*
4276	 * Check feature flags regardless of the revision level, since we
4277	 * previously didn't change the revision level when setting the flags,
4278	 * so there is a chance incompat flags are set on a rev 0 filesystem.
4279	 */
4280	if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
4281		goto failed_mount;
4282
4283	if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
4284		ext4_msg(sb, KERN_ERR,
4285			 "Number of reserved GDT blocks insanely large: %d",
4286			 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
4287		goto failed_mount;
4288	}
4289
4290	if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0,
4291			bdev_nr_sectors(sb->s_bdev)))
4292		set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
4293
4294	if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
4295		if (ext4_has_feature_inline_data(sb)) {
4296			ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
4297					" that may contain inline data");
4298			goto failed_mount;
4299		}
4300		if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
4301			ext4_msg(sb, KERN_ERR,
4302				"DAX unsupported by block device.");
4303			goto failed_mount;
4304		}
4305	}
4306
4307	if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
4308		ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
4309			 es->s_encryption_level);
4310		goto failed_mount;
4311	}
4312
4313	if (sb->s_blocksize != blocksize) {
4314		/*
4315		 * bh must be released before kill_bdev(), otherwise
4316		 * it won't be freed and its page also. kill_bdev()
4317		 * is called by sb_set_blocksize().
4318		 */
4319		brelse(bh);
4320		/* Validate the filesystem blocksize */
4321		if (!sb_set_blocksize(sb, blocksize)) {
4322			ext4_msg(sb, KERN_ERR, "bad block size %d",
4323					blocksize);
4324			bh = NULL;
4325			goto failed_mount;
4326		}
4327
4328		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
4329		offset = do_div(logical_sb_block, blocksize);
4330		bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
4331		if (IS_ERR(bh)) {
4332			ext4_msg(sb, KERN_ERR,
4333			       "Can't read superblock on 2nd try");
4334			ret = PTR_ERR(bh);
4335			bh = NULL;
4336			goto failed_mount;
4337		}
4338		es = (struct ext4_super_block *)(bh->b_data + offset);
4339		sbi->s_es = es;
4340		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
4341			ext4_msg(sb, KERN_ERR,
4342			       "Magic mismatch, very weird!");
4343			goto failed_mount;
4344		}
4345	}
4346
4347	has_huge_files = ext4_has_feature_huge_file(sb);
4348	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
4349						      has_huge_files);
4350	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
4351
4352	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
4353	if (ext4_has_feature_64bit(sb)) {
4354		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
4355		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
4356		    !is_power_of_2(sbi->s_desc_size)) {
4357			ext4_msg(sb, KERN_ERR,
4358			       "unsupported descriptor size %lu",
4359			       sbi->s_desc_size);
4360			goto failed_mount;
4361		}
4362	} else
4363		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
4364
4365	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
4366	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
4367
4368	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
4369	if (sbi->s_inodes_per_block == 0)
4370		goto cantfind_ext4;
4371	if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
4372	    sbi->s_inodes_per_group > blocksize * 8) {
4373		ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
4374			 sbi->s_inodes_per_group);
4375		goto failed_mount;
4376	}
4377	sbi->s_itb_per_group = sbi->s_inodes_per_group /
4378					sbi->s_inodes_per_block;
4379	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
4380	sbi->s_sbh = bh;
4381	sbi->s_mount_state = le16_to_cpu(es->s_state);
4382	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
4383	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
4384
4385	for (i = 0; i < 4; i++)
4386		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
4387	sbi->s_def_hash_version = es->s_def_hash_version;
4388	if (ext4_has_feature_dir_index(sb)) {
4389		i = le32_to_cpu(es->s_flags);
4390		if (i & EXT2_FLAGS_UNSIGNED_HASH)
4391			sbi->s_hash_unsigned = 3;
4392		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
4393#ifdef __CHAR_UNSIGNED__
4394			if (!sb_rdonly(sb))
4395				es->s_flags |=
4396					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
4397			sbi->s_hash_unsigned = 3;
4398#else
4399			if (!sb_rdonly(sb))
4400				es->s_flags |=
4401					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
4402#endif
4403		}
4404	}
4405
4406	/* Handle clustersize */
4407	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
4408	if (ext4_has_feature_bigalloc(sb)) {
4409		if (clustersize < blocksize) {
4410			ext4_msg(sb, KERN_ERR,
4411				 "cluster size (%d) smaller than "
4412				 "block size (%d)", clustersize, blocksize);
4413			goto failed_mount;
4414		}
4415		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
4416			le32_to_cpu(es->s_log_block_size);
4417		sbi->s_clusters_per_group =
4418			le32_to_cpu(es->s_clusters_per_group);
4419		if (sbi->s_clusters_per_group > blocksize * 8) {
4420			ext4_msg(sb, KERN_ERR,
4421				 "#clusters per group too big: %lu",
4422				 sbi->s_clusters_per_group);
4423			goto failed_mount;
4424		}
4425		if (sbi->s_blocks_per_group !=
4426		    (sbi->s_clusters_per_group * (clustersize / blocksize))) {
4427			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
4428				 "clusters per group (%lu) inconsistent",
4429				 sbi->s_blocks_per_group,
4430				 sbi->s_clusters_per_group);
4431			goto failed_mount;
4432		}
4433	} else {
4434		if (clustersize != blocksize) {
4435			ext4_msg(sb, KERN_ERR,
4436				 "fragment/cluster size (%d) != "
4437				 "block size (%d)", clustersize, blocksize);
4438			goto failed_mount;
4439		}
4440		if (sbi->s_blocks_per_group > blocksize * 8) {
4441			ext4_msg(sb, KERN_ERR,
4442				 "#blocks per group too big: %lu",
4443				 sbi->s_blocks_per_group);
4444			goto failed_mount;
4445		}
4446		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
4447		sbi->s_cluster_bits = 0;
4448	}
4449	sbi->s_cluster_ratio = clustersize / blocksize;
4450
4451	/* Do we have standard group size of clustersize * 8 blocks ? */
4452	if (sbi->s_blocks_per_group == clustersize << 3)
4453		set_opt2(sb, STD_GROUP_SIZE);
4454
4455	/*
4456	 * Test whether we have more sectors than will fit in sector_t,
4457	 * and whether the max offset is addressable by the page cache.
4458	 */
4459	err = generic_check_addressable(sb->s_blocksize_bits,
4460					ext4_blocks_count(es));
4461	if (err) {
4462		ext4_msg(sb, KERN_ERR, "filesystem"
4463			 " too large to mount safely on this system");
4464		goto failed_mount;
4465	}
4466
4467	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
4468		goto cantfind_ext4;
4469
4470	/* check blocks count against device size */
4471	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
4472	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
4473		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
4474		       "exceeds size of device (%llu blocks)",
4475		       ext4_blocks_count(es), blocks_count);
4476		goto failed_mount;
4477	}
4478
4479	/*
4480	 * It makes no sense for the first data block to be beyond the end
4481	 * of the filesystem.
4482	 */
4483	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
4484		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4485			 "block %u is beyond end of filesystem (%llu)",
4486			 le32_to_cpu(es->s_first_data_block),
4487			 ext4_blocks_count(es));
4488		goto failed_mount;
4489	}
4490	if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
4491	    (sbi->s_cluster_ratio == 1)) {
4492		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4493			 "block is 0 with a 1k block and cluster size");
4494		goto failed_mount;
4495	}
4496
4497	blocks_count = (ext4_blocks_count(es) -
4498			le32_to_cpu(es->s_first_data_block) +
4499			EXT4_BLOCKS_PER_GROUP(sb) - 1);
4500	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
4501	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
4502		ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
4503		       "(block count %llu, first data block %u, "
4504		       "blocks per group %lu)", blocks_count,
4505		       ext4_blocks_count(es),
4506		       le32_to_cpu(es->s_first_data_block),
4507		       EXT4_BLOCKS_PER_GROUP(sb));
4508		goto failed_mount;
4509	}
4510	sbi->s_groups_count = blocks_count;
4511	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
4512			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
4513	if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
4514	    le32_to_cpu(es->s_inodes_count)) {
4515		ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
4516			 le32_to_cpu(es->s_inodes_count),
4517			 ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
4518		ret = -EINVAL;
4519		goto failed_mount;
4520	}
4521	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
4522		   EXT4_DESC_PER_BLOCK(sb);
4523	if (ext4_has_feature_meta_bg(sb)) {
4524		if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
4525			ext4_msg(sb, KERN_WARNING,
4526				 "first meta block group too large: %u "
4527				 "(group descriptor block count %u)",
4528				 le32_to_cpu(es->s_first_meta_bg), db_count);
4529			goto failed_mount;
4530		}
4531	}
4532	rcu_assign_pointer(sbi->s_group_desc,
4533			   kvmalloc_array(db_count,
4534					  sizeof(struct buffer_head *),
4535					  GFP_KERNEL));
4536	if (sbi->s_group_desc == NULL) {
4537		ext4_msg(sb, KERN_ERR, "not enough memory");
4538		ret = -ENOMEM;
4539		goto failed_mount;
4540	}
4541
4542	bgl_lock_init(sbi->s_blockgroup_lock);
4543
4544	/* Pre-read the descriptors into the buffer cache */
4545	for (i = 0; i < db_count; i++) {
4546		block = descriptor_loc(sb, logical_sb_block, i);
4547		ext4_sb_breadahead_unmovable(sb, block);
4548	}
4549
4550	for (i = 0; i < db_count; i++) {
4551		struct buffer_head *bh;
4552
4553		block = descriptor_loc(sb, logical_sb_block, i);
4554		bh = ext4_sb_bread_unmovable(sb, block);
4555		if (IS_ERR(bh)) {
4556			ext4_msg(sb, KERN_ERR,
4557			       "can't read group descriptor %d", i);
4558			db_count = i;
4559			ret = PTR_ERR(bh);
4560			goto failed_mount2;
4561		}
4562		rcu_read_lock();
4563		rcu_dereference(sbi->s_group_desc)[i] = bh;
4564		rcu_read_unlock();
4565	}
4566	sbi->s_gdb_count = db_count;
4567	if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
4568		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
4569		ret = -EFSCORRUPTED;
4570		goto failed_mount2;
4571	}
4572
4573	timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
4574	spin_lock_init(&sbi->s_error_lock);
4575	INIT_WORK(&sbi->s_error_work, flush_stashed_error_work);
4576
4577	/* Register extent status tree shrinker */
4578	if (ext4_es_register_shrinker(sbi))
4579		goto failed_mount3;
4580
4581	sbi->s_stripe = ext4_get_stripe_size(sbi);
4582	sbi->s_extent_max_zeroout_kb = 32;
4583
4584	/*
4585	 * set up enough so that it can read an inode
4586	 */
4587	sb->s_op = &ext4_sops;
4588	sb->s_export_op = &ext4_export_ops;
4589	sb->s_xattr = ext4_xattr_handlers;
4590#ifdef CONFIG_FS_ENCRYPTION
4591	sb->s_cop = &ext4_cryptops;
4592#endif
4593#ifdef CONFIG_FS_VERITY
4594	sb->s_vop = &ext4_verityops;
4595#endif
4596#ifdef CONFIG_QUOTA
4597	sb->dq_op = &ext4_quota_operations;
4598	if (ext4_has_feature_quota(sb))
4599		sb->s_qcop = &dquot_quotactl_sysfile_ops;
4600	else
4601		sb->s_qcop = &ext4_qctl_operations;
4602	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
4603#endif
4604	memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
4605
4606	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
4607	mutex_init(&sbi->s_orphan_lock);
4608
4609	/* Initialize fast commit stuff */
4610	atomic_set(&sbi->s_fc_subtid, 0);
4611	atomic_set(&sbi->s_fc_ineligible_updates, 0);
4612	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
4613	INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
4614	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
4615	INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
4616	sbi->s_fc_bytes = 0;
4617	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
4618	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
4619	spin_lock_init(&sbi->s_fc_lock);
4620	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
4621	sbi->s_fc_replay_state.fc_regions = NULL;
4622	sbi->s_fc_replay_state.fc_regions_size = 0;
4623	sbi->s_fc_replay_state.fc_regions_used = 0;
4624	sbi->s_fc_replay_state.fc_regions_valid = 0;
4625	sbi->s_fc_replay_state.fc_modified_inodes = NULL;
4626	sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
4627	sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
4628
4629	sb->s_root = NULL;
4630
4631	needs_recovery = (es->s_last_orphan != 0 ||
4632			  ext4_has_feature_orphan_present(sb) ||
4633			  ext4_has_feature_journal_needs_recovery(sb));
4634
4635	if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
4636		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
4637			goto failed_mount3a;
4638
4639	/*
4640	 * The first inode we look at is the journal inode.  Don't try
4641	 * root first: it may be modified in the journal!
4642	 */
4643	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
4644		err = ext4_load_journal(sb, es, parsed_opts.journal_devnum);
4645		if (err)
4646			goto failed_mount3a;
4647	} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
4648		   ext4_has_feature_journal_needs_recovery(sb)) {
4649		ext4_msg(sb, KERN_ERR, "required journal recovery "
4650		       "suppressed and not mounted read-only");
4651		goto failed_mount_wq;
4652	} else {
4653		/* Nojournal mode, all journal mount options are illegal */
4654		if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
4655			ext4_msg(sb, KERN_ERR, "can't mount with "
4656				 "journal_checksum, fs mounted w/o journal");
4657			goto failed_mount_wq;
4658		}
4659		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4660			ext4_msg(sb, KERN_ERR, "can't mount with "
4661				 "journal_async_commit, fs mounted w/o journal");
4662			goto failed_mount_wq;
4663		}
4664		if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
4665			ext4_msg(sb, KERN_ERR, "can't mount with "
4666				 "commit=%lu, fs mounted w/o journal",
4667				 sbi->s_commit_interval / HZ);
4668			goto failed_mount_wq;
4669		}
4670		if (EXT4_MOUNT_DATA_FLAGS &
4671		    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
4672			ext4_msg(sb, KERN_ERR, "can't mount with "
4673				 "data=, fs mounted w/o journal");
4674			goto failed_mount_wq;
4675		}
4676		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
4677		clear_opt(sb, JOURNAL_CHECKSUM);
4678		clear_opt(sb, DATA_FLAGS);
4679		clear_opt2(sb, JOURNAL_FAST_COMMIT);
4680		sbi->s_journal = NULL;
4681		needs_recovery = 0;
4682		goto no_journal;
4683	}
4684
4685	if (ext4_has_feature_64bit(sb) &&
4686	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4687				       JBD2_FEATURE_INCOMPAT_64BIT)) {
4688		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
4689		goto failed_mount_wq;
4690	}
4691
4692	if (!set_journal_csum_feature_set(sb)) {
4693		ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
4694			 "feature set");
4695		goto failed_mount_wq;
4696	}
4697
4698	if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
4699		!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4700					  JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
4701		ext4_msg(sb, KERN_ERR,
4702			"Failed to set fast commit journal feature");
4703		goto failed_mount_wq;
4704	}
4705
4706	/* We have now updated the journal if required, so we can
4707	 * validate the data journaling mode. */
4708	switch (test_opt(sb, DATA_FLAGS)) {
4709	case 0:
4710		/* No mode set, assume a default based on the journal
4711		 * capabilities: ORDERED_DATA if the journal can
4712		 * cope, else JOURNAL_DATA
4713		 */
4714		if (jbd2_journal_check_available_features
4715		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4716			set_opt(sb, ORDERED_DATA);
4717			sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
4718		} else {
4719			set_opt(sb, JOURNAL_DATA);
4720			sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
4721		}
4722		break;
4723
4724	case EXT4_MOUNT_ORDERED_DATA:
4725	case EXT4_MOUNT_WRITEBACK_DATA:
4726		if (!jbd2_journal_check_available_features
4727		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4728			ext4_msg(sb, KERN_ERR, "Journal does not support "
4729			       "requested data journaling mode");
4730			goto failed_mount_wq;
4731		}
4732		break;
4733	default:
4734		break;
4735	}
4736
4737	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
4738	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4739		ext4_msg(sb, KERN_ERR, "can't mount with "
4740			"journal_async_commit in data=ordered mode");
4741		goto failed_mount_wq;
4742	}
4743
4744	set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio);
4745
4746	sbi->s_journal->j_submit_inode_data_buffers =
4747		ext4_journal_submit_inode_data_buffers;
4748	sbi->s_journal->j_finish_inode_data_buffers =
4749		ext4_journal_finish_inode_data_buffers;
4750
4751no_journal:
4752	if (!test_opt(sb, NO_MBCACHE)) {
4753		sbi->s_ea_block_cache = ext4_xattr_create_cache();
4754		if (!sbi->s_ea_block_cache) {
4755			ext4_msg(sb, KERN_ERR,
4756				 "Failed to create ea_block_cache");
4757			goto failed_mount_wq;
4758		}
4759
4760		if (ext4_has_feature_ea_inode(sb)) {
4761			sbi->s_ea_inode_cache = ext4_xattr_create_cache();
4762			if (!sbi->s_ea_inode_cache) {
4763				ext4_msg(sb, KERN_ERR,
4764					 "Failed to create ea_inode_cache");
4765				goto failed_mount_wq;
4766			}
4767		}
4768	}
4769
4770	if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
4771		ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
4772		goto failed_mount_wq;
4773	}
4774
4775	if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
4776	    !ext4_has_feature_encrypt(sb)) {
4777		ext4_set_feature_encrypt(sb);
4778		ext4_commit_super(sb);
4779	}
4780
4781	/*
4782	 * Get the # of file system overhead blocks from the
4783	 * superblock if present.
4784	 */
4785	if (es->s_overhead_clusters)
4786		sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
4787	else {
4788		err = ext4_calculate_overhead(sb);
4789		if (err)
4790			goto failed_mount_wq;
4791	}
4792
4793	/*
4794	 * The maximum number of concurrent works can be high and
4795	 * concurrency isn't really necessary.  Limit it to 1.
4796	 */
4797	EXT4_SB(sb)->rsv_conversion_wq =
4798		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
4799	if (!EXT4_SB(sb)->rsv_conversion_wq) {
4800		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
4801		ret = -ENOMEM;
4802		goto failed_mount4;
4803	}
4804
4805	/*
4806	 * The jbd2_journal_load will have done any necessary log recovery,
4807	 * so we can safely mount the rest of the filesystem now.
4808	 */
4809
4810	root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
4811	if (IS_ERR(root)) {
4812		ext4_msg(sb, KERN_ERR, "get root inode failed");
4813		ret = PTR_ERR(root);
4814		root = NULL;
4815		goto failed_mount4;
4816	}
4817	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
4818		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
4819		iput(root);
4820		goto failed_mount4;
4821	}
4822
4823	sb->s_root = d_make_root(root);
4824	if (!sb->s_root) {
4825		ext4_msg(sb, KERN_ERR, "get root dentry failed");
4826		ret = -ENOMEM;
4827		goto failed_mount4;
4828	}
4829
4830	ret = ext4_setup_super(sb, es, sb_rdonly(sb));
4831	if (ret == -EROFS) {
4832		sb->s_flags |= SB_RDONLY;
4833		ret = 0;
4834	} else if (ret)
4835		goto failed_mount4a;
4836
4837	ext4_set_resv_clusters(sb);
4838
4839	if (test_opt(sb, BLOCK_VALIDITY)) {
4840		err = ext4_setup_system_zone(sb);
4841		if (err) {
4842			ext4_msg(sb, KERN_ERR, "failed to initialize system "
4843				 "zone (%d)", err);
4844			goto failed_mount4a;
4845		}
4846	}
4847	ext4_fc_replay_cleanup(sb);
4848
4849	ext4_ext_init(sb);
4850
4851	/*
4852	 * Enable optimize_scan if number of groups is > threshold. This can be
4853	 * turned off by passing "mb_optimize_scan=0". This can also be
4854	 * turned on forcefully by passing "mb_optimize_scan=1".
4855	 */
4856	if (parsed_opts.mb_optimize_scan == 1)
4857		set_opt2(sb, MB_OPTIMIZE_SCAN);
4858	else if (parsed_opts.mb_optimize_scan == 0)
4859		clear_opt2(sb, MB_OPTIMIZE_SCAN);
4860	else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
4861		set_opt2(sb, MB_OPTIMIZE_SCAN);
4862
4863	err = ext4_mb_init(sb);
4864	if (err) {
4865		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
4866			 err);
4867		goto failed_mount5;
4868	}
4869
4870	/*
4871	 * We can only set up the journal commit callback once
4872	 * mballoc is initialized
4873	 */
4874	if (sbi->s_journal)
4875		sbi->s_journal->j_commit_callback =
4876			ext4_journal_commit_callback;
4877
4878	block = ext4_count_free_clusters(sb);
4879	ext4_free_blocks_count_set(sbi->s_es,
4880				   EXT4_C2B(sbi, block));
4881	err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
4882				  GFP_KERNEL);
4883	if (!err) {
4884		unsigned long freei = ext4_count_free_inodes(sb);
4885		sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4886		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
4887					  GFP_KERNEL);
4888	}
4889	/*
4890	 * Update the checksum after updating free space/inode
4891	 * counters.  Otherwise the superblock can have an incorrect
4892	 * checksum in the buffer cache until it is written out and
4893	 * e2fsprogs programs trying to open a file system immediately
4894	 * after it is mounted can fail.
4895	 */
4896	ext4_superblock_csum_set(sb);
4897	if (!err)
4898		err = percpu_counter_init(&sbi->s_dirs_counter,
4899					  ext4_count_dirs(sb), GFP_KERNEL);
4900	if (!err)
4901		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
4902					  GFP_KERNEL);
4903	if (!err)
4904		err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
4905					  GFP_KERNEL);
4906	if (!err)
4907		err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
4908
4909	if (err) {
4910		ext4_msg(sb, KERN_ERR, "insufficient memory");
4911		goto failed_mount6;
4912	}
4913
4914	if (ext4_has_feature_flex_bg(sb))
4915		if (!ext4_fill_flex_info(sb)) {
4916			ext4_msg(sb, KERN_ERR,
4917			       "unable to initialize "
4918			       "flex_bg meta info!");
4919			ret = -ENOMEM;
4920			goto failed_mount6;
4921		}
4922
4923	err = ext4_register_li_request(sb, first_not_zeroed);
4924	if (err)
4925		goto failed_mount6;
4926
4927	err = ext4_register_sysfs(sb);
4928	if (err)
4929		goto failed_mount7;
4930
4931	err = ext4_init_orphan_info(sb);
4932	if (err)
4933		goto failed_mount8;
4934#ifdef CONFIG_QUOTA
4935	/* Enable quota usage during mount. */
4936	if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
4937		err = ext4_enable_quotas(sb);
4938		if (err)
4939			goto failed_mount9;
4940	}
4941#endif  /* CONFIG_QUOTA */
4942
4943	/*
4944	 * Save the original bdev mapping's wb_err value which could be
4945	 * used to detect the metadata async write error.
4946	 */
4947	spin_lock_init(&sbi->s_bdev_wb_lock);
4948	errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
4949				 &sbi->s_bdev_wb_err);
4950	sb->s_bdev->bd_super = sb;
4951	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
4952	ext4_orphan_cleanup(sb, es);
4953	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
4954	if (needs_recovery) {
4955		ext4_msg(sb, KERN_INFO, "recovery complete");
4956		err = ext4_mark_recovery_complete(sb, es);
4957		if (err)
4958			goto failed_mount9;
4959	}
4960	if (EXT4_SB(sb)->s_journal) {
4961		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
4962			descr = " journalled data mode";
4963		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
4964			descr = " ordered data mode";
4965		else
4966			descr = " writeback data mode";
4967	} else
4968		descr = "out journal";
4969
4970	if (test_opt(sb, DISCARD)) {
4971		struct request_queue *q = bdev_get_queue(sb->s_bdev);
4972		if (!blk_queue_discard(q))
4973			ext4_msg(sb, KERN_WARNING,
4974				 "mounting with \"discard\" option, but "
4975				 "the device does not support discard");
4976	}
4977
4978	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
4979		ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
4980			 "Opts: %.*s%s%s. Quota mode: %s.", descr,
4981			 (int) sizeof(sbi->s_es->s_mount_opts),
4982			 sbi->s_es->s_mount_opts,
4983			 *sbi->s_es->s_mount_opts ? "; " : "", orig_data,
4984			 ext4_quota_mode(sb));
4985
4986	if (es->s_error_count)
4987		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
4988
4989	/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
4990	ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
4991	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
4992	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
4993	atomic_set(&sbi->s_warning_count, 0);
4994	atomic_set(&sbi->s_msg_count, 0);
4995
4996	kfree(orig_data);
4997	return 0;
4998
4999cantfind_ext4:
5000	if (!silent)
5001		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
5002	goto failed_mount;
5003
5004failed_mount9:
5005	ext4_release_orphan_info(sb);
5006failed_mount8:
5007	ext4_unregister_sysfs(sb);
5008	kobject_put(&sbi->s_kobj);
5009failed_mount7:
5010	ext4_unregister_li_request(sb);
5011failed_mount6:
5012	ext4_mb_release(sb);
5013	rcu_read_lock();
5014	flex_groups = rcu_dereference(sbi->s_flex_groups);
5015	if (flex_groups) {
5016		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
5017			kvfree(flex_groups[i]);
5018		kvfree(flex_groups);
5019	}
5020	rcu_read_unlock();
5021	percpu_counter_destroy(&sbi->s_freeclusters_counter);
5022	percpu_counter_destroy(&sbi->s_freeinodes_counter);
5023	percpu_counter_destroy(&sbi->s_dirs_counter);
5024	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
5025	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
5026	percpu_free_rwsem(&sbi->s_writepages_rwsem);
5027failed_mount5:
5028	ext4_ext_release(sb);
5029	ext4_release_system_zone(sb);
5030failed_mount4a:
5031	dput(sb->s_root);
5032	sb->s_root = NULL;
5033failed_mount4:
5034	ext4_msg(sb, KERN_ERR, "mount failed");
5035	if (EXT4_SB(sb)->rsv_conversion_wq)
5036		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
5037failed_mount_wq:
5038	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
5039	sbi->s_ea_inode_cache = NULL;
5040
5041	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
5042	sbi->s_ea_block_cache = NULL;
5043
5044	if (sbi->s_journal) {
5045		jbd2_journal_destroy(sbi->s_journal);
5046		sbi->s_journal = NULL;
5047	}
5048failed_mount3a:
5049	ext4_es_unregister_shrinker(sbi);
5050failed_mount3:
5051	flush_work(&sbi->s_error_work);
5052	del_timer_sync(&sbi->s_err_report);
5053	ext4_stop_mmpd(sbi);
5054failed_mount2:
5055	rcu_read_lock();
5056	group_desc = rcu_dereference(sbi->s_group_desc);
5057	for (i = 0; i < db_count; i++)
5058		brelse(group_desc[i]);
5059	kvfree(group_desc);
5060	rcu_read_unlock();
5061failed_mount:
5062	if (sbi->s_chksum_driver)
5063		crypto_free_shash(sbi->s_chksum_driver);
5064
5065#ifdef CONFIG_UNICODE
5066	utf8_unload(sb->s_encoding);
5067#endif
5068
5069#ifdef CONFIG_QUOTA
5070	for (i = 0; i < EXT4_MAXQUOTAS; i++)
5071		kfree(get_qf_name(sb, sbi, i));
5072#endif
5073	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
5074	/* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
5075	brelse(bh);
5076	ext4_blkdev_remove(sbi);
5077out_fail:
5078	sb->s_fs_info = NULL;
5079	kfree(sbi->s_blockgroup_lock);
5080out_free_base:
5081	kfree(sbi);
5082	kfree(orig_data);
5083	fs_put_dax(dax_dev);
5084	return err ? err : ret;
5085}
5086
5087/*
5088 * Setup any per-fs journal parameters now.  We'll do this both on
5089 * initial mount, once the journal has been initialised but before we've
5090 * done any recovery; and again on any subsequent remount.
5091 */
5092static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
5093{
5094	struct ext4_sb_info *sbi = EXT4_SB(sb);
5095
5096	journal->j_commit_interval = sbi->s_commit_interval;
5097	journal->j_min_batch_time = sbi->s_min_batch_time;
5098	journal->j_max_batch_time = sbi->s_max_batch_time;
5099	ext4_fc_init(sb, journal);
5100
5101	write_lock(&journal->j_state_lock);
5102	if (test_opt(sb, BARRIER))
5103		journal->j_flags |= JBD2_BARRIER;
5104	else
5105		journal->j_flags &= ~JBD2_BARRIER;
5106	if (test_opt(sb, DATA_ERR_ABORT))
5107		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
5108	else
5109		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
5110	write_unlock(&journal->j_state_lock);
5111}
5112
5113static struct inode *ext4_get_journal_inode(struct super_block *sb,
5114					     unsigned int journal_inum)
5115{
5116	struct inode *journal_inode;
5117
5118	/*
5119	 * Test for the existence of a valid inode on disk.  Bad things
5120	 * happen if we iget() an unused inode, as the subsequent iput()
5121	 * will try to delete it.
5122	 */
5123	journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
5124	if (IS_ERR(journal_inode)) {
5125		ext4_msg(sb, KERN_ERR, "no journal found");
5126		return NULL;
5127	}
5128	if (!journal_inode->i_nlink) {
5129		make_bad_inode(journal_inode);
5130		iput(journal_inode);
5131		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
5132		return NULL;
5133	}
5134
5135	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
5136		  journal_inode, journal_inode->i_size);
5137	if (!S_ISREG(journal_inode->i_mode)) {
5138		ext4_msg(sb, KERN_ERR, "invalid journal inode");
5139		iput(journal_inode);
5140		return NULL;
5141	}
5142	return journal_inode;
5143}
5144
5145static journal_t *ext4_get_journal(struct super_block *sb,
5146				   unsigned int journal_inum)
5147{
5148	struct inode *journal_inode;
5149	journal_t *journal;
5150
5151	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5152		return NULL;
5153
5154	journal_inode = ext4_get_journal_inode(sb, journal_inum);
5155	if (!journal_inode)
5156		return NULL;
5157
5158	journal = jbd2_journal_init_inode(journal_inode);
5159	if (!journal) {
5160		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
5161		iput(journal_inode);
5162		return NULL;
5163	}
5164	journal->j_private = sb;
5165	ext4_init_journal_params(sb, journal);
5166	return journal;
5167}
5168
5169static journal_t *ext4_get_dev_journal(struct super_block *sb,
5170				       dev_t j_dev)
5171{
5172	struct buffer_head *bh;
5173	journal_t *journal;
5174	ext4_fsblk_t start;
5175	ext4_fsblk_t len;
5176	int hblock, blocksize;
5177	ext4_fsblk_t sb_block;
5178	unsigned long offset;
5179	struct ext4_super_block *es;
5180	struct block_device *bdev;
5181
5182	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5183		return NULL;
5184
5185	bdev = ext4_blkdev_get(j_dev, sb);
5186	if (bdev == NULL)
5187		return NULL;
5188
5189	blocksize = sb->s_blocksize;
5190	hblock = bdev_logical_block_size(bdev);
5191	if (blocksize < hblock) {
5192		ext4_msg(sb, KERN_ERR,
5193			"blocksize too small for journal device");
5194		goto out_bdev;
5195	}
5196
5197	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
5198	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
5199	set_blocksize(bdev, blocksize);
5200	if (!(bh = __bread(bdev, sb_block, blocksize))) {
5201		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
5202		       "external journal");
5203		goto out_bdev;
5204	}
5205
5206	es = (struct ext4_super_block *) (bh->b_data + offset);
5207	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
5208	    !(le32_to_cpu(es->s_feature_incompat) &
5209	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
5210		ext4_msg(sb, KERN_ERR, "external journal has "
5211					"bad superblock");
5212		brelse(bh);
5213		goto out_bdev;
5214	}
5215
5216	if ((le32_to_cpu(es->s_feature_ro_compat) &
5217	     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
5218	    es->s_checksum != ext4_superblock_csum(sb, es)) {
5219		ext4_msg(sb, KERN_ERR, "external journal has "
5220				       "corrupt superblock");
5221		brelse(bh);
5222		goto out_bdev;
5223	}
5224
5225	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
5226		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
5227		brelse(bh);
5228		goto out_bdev;
5229	}
5230
5231	len = ext4_blocks_count(es);
5232	start = sb_block + 1;
5233	brelse(bh);	/* we're done with the superblock */
5234
5235	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
5236					start, len, blocksize);
5237	if (!journal) {
5238		ext4_msg(sb, KERN_ERR, "failed to create device journal");
5239		goto out_bdev;
5240	}
5241	journal->j_private = sb;
5242	if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) {
5243		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
5244		goto out_journal;
5245	}
5246	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
5247		ext4_msg(sb, KERN_ERR, "External journal has more than one "
5248					"user (unsupported) - %d",
5249			be32_to_cpu(journal->j_superblock->s_nr_users));
5250		goto out_journal;
5251	}
5252	EXT4_SB(sb)->s_journal_bdev = bdev;
5253	ext4_init_journal_params(sb, journal);
5254	return journal;
5255
5256out_journal:
5257	jbd2_journal_destroy(journal);
5258out_bdev:
5259	ext4_blkdev_put(bdev);
5260	return NULL;
5261}
5262
5263static int ext4_load_journal(struct super_block *sb,
5264			     struct ext4_super_block *es,
5265			     unsigned long journal_devnum)
5266{
5267	journal_t *journal;
5268	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
5269	dev_t journal_dev;
5270	int err = 0;
5271	int really_read_only;
5272	int journal_dev_ro;
5273
5274	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
5275		return -EFSCORRUPTED;
5276
5277	if (journal_devnum &&
5278	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
5279		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
5280			"numbers have changed");
5281		journal_dev = new_decode_dev(journal_devnum);
5282	} else
5283		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
5284
5285	if (journal_inum && journal_dev) {
5286		ext4_msg(sb, KERN_ERR,
5287			 "filesystem has both journal inode and journal device!");
5288		return -EINVAL;
5289	}
5290
5291	if (journal_inum) {
5292		journal = ext4_get_journal(sb, journal_inum);
5293		if (!journal)
5294			return -EINVAL;
5295	} else {
5296		journal = ext4_get_dev_journal(sb, journal_dev);
5297		if (!journal)
5298			return -EINVAL;
5299	}
5300
5301	journal_dev_ro = bdev_read_only(journal->j_dev);
5302	really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;
5303
5304	if (journal_dev_ro && !sb_rdonly(sb)) {
5305		ext4_msg(sb, KERN_ERR,
5306			 "journal device read-only, try mounting with '-o ro'");
5307		err = -EROFS;
5308		goto err_out;
5309	}
5310
5311	/*
5312	 * Are we loading a blank journal or performing recovery after a
5313	 * crash?  For recovery, we need to check in advance whether we
5314	 * can get read-write access to the device.
5315	 */
5316	if (ext4_has_feature_journal_needs_recovery(sb)) {
5317		if (sb_rdonly(sb)) {
5318			ext4_msg(sb, KERN_INFO, "INFO: recovery "
5319					"required on readonly filesystem");
5320			if (really_read_only) {
5321				ext4_msg(sb, KERN_ERR, "write access "
5322					"unavailable, cannot proceed "
5323					"(try mounting with noload)");
5324				err = -EROFS;
5325				goto err_out;
5326			}
5327			ext4_msg(sb, KERN_INFO, "write access will "
5328			       "be enabled during recovery");
5329		}
5330	}
5331
5332	if (!(journal->j_flags & JBD2_BARRIER))
5333		ext4_msg(sb, KERN_INFO, "barriers disabled");
5334
5335	if (!ext4_has_feature_journal_needs_recovery(sb))
5336		err = jbd2_journal_wipe(journal, !really_read_only);
5337	if (!err) {
5338		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
5339		if (save)
5340			memcpy(save, ((char *) es) +
5341			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
5342		err = jbd2_journal_load(journal);
5343		if (save)
5344			memcpy(((char *) es) + EXT4_S_ERR_START,
5345			       save, EXT4_S_ERR_LEN);
5346		kfree(save);
5347	}
5348
5349	if (err) {
5350		ext4_msg(sb, KERN_ERR, "error loading journal");
5351		goto err_out;
5352	}
5353
5354	EXT4_SB(sb)->s_journal = journal;
5355	err = ext4_clear_journal_err(sb, es);
5356	if (err) {
5357		EXT4_SB(sb)->s_journal = NULL;
5358		jbd2_journal_destroy(journal);
5359		return err;
5360	}
5361
5362	if (!really_read_only && journal_devnum &&
5363	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
5364		es->s_journal_dev = cpu_to_le32(journal_devnum);
5365
5366		/* Make sure we flush the recovery flag to disk. */
5367		ext4_commit_super(sb);
5368	}
5369
5370	return 0;
5371
5372err_out:
5373	jbd2_journal_destroy(journal);
5374	return err;
5375}
5376
5377/* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */
5378static void ext4_update_super(struct super_block *sb)
5379{
5380	struct ext4_sb_info *sbi = EXT4_SB(sb);
5381	struct ext4_super_block *es = sbi->s_es;
5382	struct buffer_head *sbh = sbi->s_sbh;
5383
5384	lock_buffer(sbh);
5385	/*
5386	 * If the file system is mounted read-only, don't update the
5387	 * superblock write time.  This avoids updating the superblock
5388	 * write time when we are mounting the root file system
5389	 * read/only but we need to replay the journal; at that point,
5390	 * for people who are east of GMT and who make their clock
5391	 * tick in localtime for Windows bug-for-bug compatibility,
5392	 * the clock is set in the future, and this will cause e2fsck
5393	 * to complain and force a full file system check.
5394	 */
5395	if (!(sb->s_flags & SB_RDONLY))
5396		ext4_update_tstamp(es, s_wtime);
5397	es->s_kbytes_written =
5398		cpu_to_le64(sbi->s_kbytes_written +
5399		    ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
5400		      sbi->s_sectors_written_start) >> 1));
5401	if (percpu_counter_initialized(&sbi->s_freeclusters_counter))
5402		ext4_free_blocks_count_set(es,
5403			EXT4_C2B(sbi, percpu_counter_sum_positive(
5404				&sbi->s_freeclusters_counter)));
5405	if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
5406		es->s_free_inodes_count =
5407			cpu_to_le32(percpu_counter_sum_positive(
5408				&sbi->s_freeinodes_counter));
5409	/* Copy error information to the on-disk superblock */
5410	spin_lock(&sbi->s_error_lock);
5411	if (sbi->s_add_error_count > 0) {
5412		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
5413		if (!es->s_first_error_time && !es->s_first_error_time_hi) {
5414			__ext4_update_tstamp(&es->s_first_error_time,
5415					     &es->s_first_error_time_hi,
5416					     sbi->s_first_error_time);
5417			strncpy(es->s_first_error_func, sbi->s_first_error_func,
5418				sizeof(es->s_first_error_func));
5419			es->s_first_error_line =
5420				cpu_to_le32(sbi->s_first_error_line);
5421			es->s_first_error_ino =
5422				cpu_to_le32(sbi->s_first_error_ino);
5423			es->s_first_error_block =
5424				cpu_to_le64(sbi->s_first_error_block);
5425			es->s_first_error_errcode =
5426				ext4_errno_to_code(sbi->s_first_error_code);
5427		}
5428		__ext4_update_tstamp(&es->s_last_error_time,
5429				     &es->s_last_error_time_hi,
5430				     sbi->s_last_error_time);
5431		strncpy(es->s_last_error_func, sbi->s_last_error_func,
5432			sizeof(es->s_last_error_func));
5433		es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
5434		es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
5435		es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
5436		es->s_last_error_errcode =
5437				ext4_errno_to_code(sbi->s_last_error_code);
5438		/*
5439		 * Start the daily error reporting function if it hasn't been
5440		 * started already
5441		 */
5442		if (!es->s_error_count)
5443			mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
5444		le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
5445		sbi->s_add_error_count = 0;
5446	}
5447	spin_unlock(&sbi->s_error_lock);
5448
5449	ext4_superblock_csum_set(sb);
5450	unlock_buffer(sbh);
5451}
5452
5453static int ext4_commit_super(struct super_block *sb)
5454{
5455	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
5456	int error = 0;
5457
5458	if (!sbh)
5459		return -EINVAL;
5460	if (block_device_ejected(sb))
5461		return -ENODEV;
5462
5463	ext4_update_super(sb);
5464
5465	if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
5466		/*
5467		 * Oh, dear.  A previous attempt to write the
5468		 * superblock failed.  This could happen because the
5469		 * USB device was yanked out.  Or it could happen to
5470		 * be a transient write error and maybe the block will
5471		 * be remapped.  Nothing we can do but to retry the
5472		 * write and hope for the best.
5473		 */
5474		ext4_msg(sb, KERN_ERR, "previous I/O error to "
5475		       "superblock detected");
5476		clear_buffer_write_io_error(sbh);
5477		set_buffer_uptodate(sbh);
5478	}
5479	BUFFER_TRACE(sbh, "marking dirty");
5480	mark_buffer_dirty(sbh);
5481	error = __sync_dirty_buffer(sbh,
5482		REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
5483	if (buffer_write_io_error(sbh)) {
5484		ext4_msg(sb, KERN_ERR, "I/O error while writing "
5485		       "superblock");
5486		clear_buffer_write_io_error(sbh);
5487		set_buffer_uptodate(sbh);
5488	}
5489	return error;
5490}
5491
5492/*
5493 * Have we just finished recovery?  If so, and if we are mounting (or
5494 * remounting) the filesystem readonly, then we will end up with a
5495 * consistent fs on disk.  Record that fact.
5496 */
5497static int ext4_mark_recovery_complete(struct super_block *sb,
5498				       struct ext4_super_block *es)
5499{
5500	int err;
5501	journal_t *journal = EXT4_SB(sb)->s_journal;
5502
5503	if (!ext4_has_feature_journal(sb)) {
5504		if (journal != NULL) {
5505			ext4_error(sb, "Journal got removed while the fs was "
5506				   "mounted!");
5507			return -EFSCORRUPTED;
5508		}
5509		return 0;
5510	}
5511	jbd2_journal_lock_updates(journal);
5512	err = jbd2_journal_flush(journal, 0);
5513	if (err < 0)
5514		goto out;
5515
5516	if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) ||
5517	    ext4_has_feature_orphan_present(sb))) {
5518		if (!ext4_orphan_file_empty(sb)) {
5519			ext4_error(sb, "Orphan file not empty on read-only fs.");
5520			err = -EFSCORRUPTED;
5521			goto out;
5522		}
5523		ext4_clear_feature_journal_needs_recovery(sb);
5524		ext4_clear_feature_orphan_present(sb);
5525		ext4_commit_super(sb);
5526	}
5527out:
5528	jbd2_journal_unlock_updates(journal);
5529	return err;
5530}
5531
5532/*
5533 * If we are mounting (or read-write remounting) a filesystem whose journal
5534 * has recorded an error from a previous lifetime, move that error to the
5535 * main filesystem now.
5536 */
5537static int ext4_clear_journal_err(struct super_block *sb,
5538				   struct ext4_super_block *es)
5539{
5540	journal_t *journal;
5541	int j_errno;
5542	const char *errstr;
5543
5544	if (!ext4_has_feature_journal(sb)) {
5545		ext4_error(sb, "Journal got removed while the fs was mounted!");
5546		return -EFSCORRUPTED;
5547	}
5548
5549	journal = EXT4_SB(sb)->s_journal;
5550
5551	/*
5552	 * Now check for any error status which may have been recorded in the
5553	 * journal by a prior ext4_error() or ext4_abort()
5554	 */
5555
5556	j_errno = jbd2_journal_errno(journal);
5557	if (j_errno) {
5558		char nbuf[16];
5559
5560		errstr = ext4_decode_error(sb, j_errno, nbuf);
5561		ext4_warning(sb, "Filesystem error recorded "
5562			     "from previous mount: %s", errstr);
5563		ext4_warning(sb, "Marking fs in need of filesystem check.");
5564
5565		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
5566		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
5567		ext4_commit_super(sb);
5568
5569		jbd2_journal_clear_err(journal);
5570		jbd2_journal_update_sb_errno(journal);
5571	}
5572	return 0;
5573}
5574
5575/*
5576 * Force the running and committing transactions to commit,
5577 * and wait on the commit.
5578 */
5579int ext4_force_commit(struct super_block *sb)
5580{
5581	journal_t *journal;
5582
5583	if (sb_rdonly(sb))
5584		return 0;
5585
5586	journal = EXT4_SB(sb)->s_journal;
5587	return ext4_journal_force_commit(journal);
5588}
5589
5590static int ext4_sync_fs(struct super_block *sb, int wait)
5591{
5592	int ret = 0;
5593	tid_t target;
5594	bool needs_barrier = false;
5595	struct ext4_sb_info *sbi = EXT4_SB(sb);
5596
5597	if (unlikely(ext4_forced_shutdown(sbi)))
5598		return 0;
5599
5600	trace_ext4_sync_fs(sb, wait);
5601	flush_workqueue(sbi->rsv_conversion_wq);
5602	/*
5603	 * Writeback quota in non-journalled quota case - journalled quota has
5604	 * no dirty dquots
5605	 */
5606	dquot_writeback_dquots(sb, -1);
5607	/*
5608	 * Data writeback is possible w/o journal transaction, so barrier must
5609	 * being sent at the end of the function. But we can skip it if
5610	 * transaction_commit will do it for us.
5611	 */
5612	if (sbi->s_journal) {
5613		target = jbd2_get_latest_transaction(sbi->s_journal);
5614		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
5615		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
5616			needs_barrier = true;
5617
5618		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
5619			if (wait)
5620				ret = jbd2_log_wait_commit(sbi->s_journal,
5621							   target);
5622		}
5623	} else if (wait && test_opt(sb, BARRIER))
5624		needs_barrier = true;
5625	if (needs_barrier) {
5626		int err;
5627		err = blkdev_issue_flush(sb->s_bdev);
5628		if (!ret)
5629			ret = err;
5630	}
5631
5632	return ret;
5633}
5634
5635/*
5636 * LVM calls this function before a (read-only) snapshot is created.  This
5637 * gives us a chance to flush the journal completely and mark the fs clean.
5638 *
5639 * Note that only this function cannot bring a filesystem to be in a clean
5640 * state independently. It relies on upper layer to stop all data & metadata
5641 * modifications.
5642 */
5643static int ext4_freeze(struct super_block *sb)
5644{
5645	int error = 0;
5646	journal_t *journal;
5647
5648	if (sb_rdonly(sb))
5649		return 0;
5650
5651	journal = EXT4_SB(sb)->s_journal;
5652
5653	if (journal) {
5654		/* Now we set up the journal barrier. */
5655		jbd2_journal_lock_updates(journal);
5656
5657		/*
5658		 * Don't clear the needs_recovery flag if we failed to
5659		 * flush the journal.
5660		 */
5661		error = jbd2_journal_flush(journal, 0);
5662		if (error < 0)
5663			goto out;
5664
5665		/* Journal blocked and flushed, clear needs_recovery flag. */
5666		ext4_clear_feature_journal_needs_recovery(sb);
5667		if (ext4_orphan_file_empty(sb))
5668			ext4_clear_feature_orphan_present(sb);
5669	}
5670
5671	error = ext4_commit_super(sb);
5672out:
5673	if (journal)
5674		/* we rely on upper layer to stop further updates */
5675		jbd2_journal_unlock_updates(journal);
5676	return error;
5677}
5678
5679/*
5680 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
5681 * flag here, even though the filesystem is not technically dirty yet.
5682 */
5683static int ext4_unfreeze(struct super_block *sb)
5684{
5685	if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
5686		return 0;
5687
5688	if (EXT4_SB(sb)->s_journal) {
5689		/* Reset the needs_recovery flag before the fs is unlocked. */
5690		ext4_set_feature_journal_needs_recovery(sb);
5691		if (ext4_has_feature_orphan_file(sb))
5692			ext4_set_feature_orphan_present(sb);
5693	}
5694
5695	ext4_commit_super(sb);
5696	return 0;
5697}
5698
5699/*
5700 * Structure to save mount options for ext4_remount's benefit
5701 */
5702struct ext4_mount_options {
5703	unsigned long s_mount_opt;
5704	unsigned long s_mount_opt2;
5705	kuid_t s_resuid;
5706	kgid_t s_resgid;
5707	unsigned long s_commit_interval;
5708	u32 s_min_batch_time, s_max_batch_time;
5709#ifdef CONFIG_QUOTA
5710	int s_jquota_fmt;
5711	char *s_qf_names[EXT4_MAXQUOTAS];
5712#endif
5713};
5714
5715static int ext4_remount(struct super_block *sb, int *flags, char *data)
5716{
5717	struct ext4_super_block *es;
5718	struct ext4_sb_info *sbi = EXT4_SB(sb);
5719	unsigned long old_sb_flags, vfs_flags;
5720	struct ext4_mount_options old_opts;
5721	int enable_quota = 0;
5722	ext4_group_t g;
5723	int err = 0;
5724#ifdef CONFIG_QUOTA
5725	int i, j;
5726	char *to_free[EXT4_MAXQUOTAS];
5727#endif
5728	char *orig_data = kstrdup(data, GFP_KERNEL);
5729	struct ext4_parsed_options parsed_opts;
5730
5731	parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
5732	parsed_opts.journal_devnum = 0;
5733
5734	if (data && !orig_data)
5735		return -ENOMEM;
5736
5737	/* Store the original options */
5738	old_sb_flags = sb->s_flags;
5739	old_opts.s_mount_opt = sbi->s_mount_opt;
5740	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
5741	old_opts.s_resuid = sbi->s_resuid;
5742	old_opts.s_resgid = sbi->s_resgid;
5743	old_opts.s_commit_interval = sbi->s_commit_interval;
5744	old_opts.s_min_batch_time = sbi->s_min_batch_time;
5745	old_opts.s_max_batch_time = sbi->s_max_batch_time;
5746#ifdef CONFIG_QUOTA
5747	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
5748	for (i = 0; i < EXT4_MAXQUOTAS; i++)
5749		if (sbi->s_qf_names[i]) {
5750			char *qf_name = get_qf_name(sb, sbi, i);
5751
5752			old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
5753			if (!old_opts.s_qf_names[i]) {
5754				for (j = 0; j < i; j++)
5755					kfree(old_opts.s_qf_names[j]);
5756				kfree(orig_data);
5757				return -ENOMEM;
5758			}
5759		} else
5760			old_opts.s_qf_names[i] = NULL;
5761#endif
5762	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
5763		parsed_opts.journal_ioprio =
5764			sbi->s_journal->j_task->io_context->ioprio;
5765
5766	/*
5767	 * Some options can be enabled by ext4 and/or by VFS mount flag
5768	 * either way we need to make sure it matches in both *flags and
5769	 * s_flags. Copy those selected flags from *flags to s_flags
5770	 */
5771	vfs_flags = SB_LAZYTIME | SB_I_VERSION;
5772	sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
5773
5774	if (!parse_options(data, sb, &parsed_opts, 1)) {
5775		err = -EINVAL;
5776		goto restore_opts;
5777	}
5778
5779	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
5780	    test_opt(sb, JOURNAL_CHECKSUM)) {
5781		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
5782			 "during remount not supported; ignoring");
5783		sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
5784	}
5785
5786	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
5787		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
5788			ext4_msg(sb, KERN_ERR, "can't mount with "
5789				 "both data=journal and delalloc");
5790			err = -EINVAL;
5791			goto restore_opts;
5792		}
5793		if (test_opt(sb, DIOREAD_NOLOCK)) {
5794			ext4_msg(sb, KERN_ERR, "can't mount with "
5795				 "both data=journal and dioread_nolock");
5796			err = -EINVAL;
5797			goto restore_opts;
5798		}
5799	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
5800		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
5801			ext4_msg(sb, KERN_ERR, "can't mount with "
5802				"journal_async_commit in data=ordered mode");
5803			err = -EINVAL;
5804			goto restore_opts;
5805		}
5806	}
5807
5808	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
5809		ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
5810		err = -EINVAL;
5811		goto restore_opts;
5812	}
5813
5814	if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
5815		ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
5816
5817	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
5818		(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
5819
5820	es = sbi->s_es;
5821
5822	if (sbi->s_journal) {
5823		ext4_init_journal_params(sb, sbi->s_journal);
5824		set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio);
5825	}
5826
5827	/* Flush outstanding errors before changing fs state */
5828	flush_work(&sbi->s_error_work);
5829
5830	if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
5831		if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
5832			err = -EROFS;
5833			goto restore_opts;
5834		}
5835
5836		if (*flags & SB_RDONLY) {
5837			err = sync_filesystem(sb);
5838			if (err < 0)
5839				goto restore_opts;
5840			err = dquot_suspend(sb, -1);
5841			if (err < 0)
5842				goto restore_opts;
5843
5844			/*
5845			 * First of all, the unconditional stuff we have to do
5846			 * to disable replay of the journal when we next remount
5847			 */
5848			sb->s_flags |= SB_RDONLY;
5849
5850			/*
5851			 * OK, test if we are remounting a valid rw partition
5852			 * readonly, and if so set the rdonly flag and then
5853			 * mark the partition as valid again.
5854			 */
5855			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
5856			    (sbi->s_mount_state & EXT4_VALID_FS))
5857				es->s_state = cpu_to_le16(sbi->s_mount_state);
5858
5859			if (sbi->s_journal) {
5860				/*
5861				 * We let remount-ro finish even if marking fs
5862				 * as clean failed...
5863				 */
5864				ext4_mark_recovery_complete(sb, es);
5865			}
5866		} else {
5867			/* Make sure we can mount this feature set readwrite */
5868			if (ext4_has_feature_readonly(sb) ||
5869			    !ext4_feature_set_ok(sb, 0)) {
5870				err = -EROFS;
5871				goto restore_opts;
5872			}
5873			/*
5874			 * Make sure the group descriptor checksums
5875			 * are sane.  If they aren't, refuse to remount r/w.
5876			 */
5877			for (g = 0; g < sbi->s_groups_count; g++) {
5878				struct ext4_group_desc *gdp =
5879					ext4_get_group_desc(sb, g, NULL);
5880
5881				if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
5882					ext4_msg(sb, KERN_ERR,
5883	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
5884		g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
5885					       le16_to_cpu(gdp->bg_checksum));
5886					err = -EFSBADCRC;
5887					goto restore_opts;
5888				}
5889			}
5890
5891			/*
5892			 * If we have an unprocessed orphan list hanging
5893			 * around from a previously readonly bdev mount,
5894			 * require a full umount/remount for now.
5895			 */
5896			if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
5897				ext4_msg(sb, KERN_WARNING, "Couldn't "
5898				       "remount RDWR because of unprocessed "
5899				       "orphan inode list.  Please "
5900				       "umount/remount instead");
5901				err = -EINVAL;
5902				goto restore_opts;
5903			}
5904
5905			/*
5906			 * Mounting a RDONLY partition read-write, so reread
5907			 * and store the current valid flag.  (It may have
5908			 * been changed by e2fsck since we originally mounted
5909			 * the partition.)
5910			 */
5911			if (sbi->s_journal) {
5912				err = ext4_clear_journal_err(sb, es);
5913				if (err)
5914					goto restore_opts;
5915			}
5916			sbi->s_mount_state = le16_to_cpu(es->s_state);
5917
5918			err = ext4_setup_super(sb, es, 0);
5919			if (err)
5920				goto restore_opts;
5921
5922			sb->s_flags &= ~SB_RDONLY;
5923			if (ext4_has_feature_mmp(sb))
5924				if (ext4_multi_mount_protect(sb,
5925						le64_to_cpu(es->s_mmp_block))) {
5926					err = -EROFS;
5927					goto restore_opts;
5928				}
5929			enable_quota = 1;
5930		}
5931	}
5932
5933	/*
5934	 * Reinitialize lazy itable initialization thread based on
5935	 * current settings
5936	 */
5937	if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
5938		ext4_unregister_li_request(sb);
5939	else {
5940		ext4_group_t first_not_zeroed;
5941		first_not_zeroed = ext4_has_uninit_itable(sb);
5942		ext4_register_li_request(sb, first_not_zeroed);
5943	}
5944
5945	/*
5946	 * Handle creation of system zone data early because it can fail.
5947	 * Releasing of existing data is done when we are sure remount will
5948	 * succeed.
5949	 */
5950	if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
5951		err = ext4_setup_system_zone(sb);
5952		if (err)
5953			goto restore_opts;
5954	}
5955
5956	if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
5957		err = ext4_commit_super(sb);
5958		if (err)
5959			goto restore_opts;
5960	}
5961
5962#ifdef CONFIG_QUOTA
5963	/* Release old quota file names */
5964	for (i = 0; i < EXT4_MAXQUOTAS; i++)
5965		kfree(old_opts.s_qf_names[i]);
5966	if (enable_quota) {
5967		if (sb_any_quota_suspended(sb))
5968			dquot_resume(sb, -1);
5969		else if (ext4_has_feature_quota(sb)) {
5970			err = ext4_enable_quotas(sb);
5971			if (err)
5972				goto restore_opts;
5973		}
5974	}
5975#endif
5976	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
5977		ext4_release_system_zone(sb);
5978
5979	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
5980		ext4_stop_mmpd(sbi);
5981
5982	/*
5983	 * Some options can be enabled by ext4 and/or by VFS mount flag
5984	 * either way we need to make sure it matches in both *flags and
5985	 * s_flags. Copy those selected flags from s_flags to *flags
5986	 */
5987	*flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);
5988
5989	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. Quota mode: %s.",
5990		 orig_data, ext4_quota_mode(sb));
5991	kfree(orig_data);
5992	return 0;
5993
5994restore_opts:
5995	sb->s_flags = old_sb_flags;
5996	sbi->s_mount_opt = old_opts.s_mount_opt;
5997	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
5998	sbi->s_resuid = old_opts.s_resuid;
5999	sbi->s_resgid = old_opts.s_resgid;
6000	sbi->s_commit_interval = old_opts.s_commit_interval;
6001	sbi->s_min_batch_time = old_opts.s_min_batch_time;
6002	sbi->s_max_batch_time = old_opts.s_max_batch_time;
6003	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
6004		ext4_release_system_zone(sb);
6005#ifdef CONFIG_QUOTA
6006	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
6007	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
6008		to_free[i] = get_qf_name(sb, sbi, i);
6009		rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
6010	}
6011	synchronize_rcu();
6012	for (i = 0; i < EXT4_MAXQUOTAS; i++)
6013		kfree(to_free[i]);
6014#endif
6015	if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
6016		ext4_stop_mmpd(sbi);
6017	kfree(orig_data);
6018	return err;
6019}
6020
6021#ifdef CONFIG_QUOTA
6022static int ext4_statfs_project(struct super_block *sb,
6023			       kprojid_t projid, struct kstatfs *buf)
6024{
6025	struct kqid qid;
6026	struct dquot *dquot;
6027	u64 limit;
6028	u64 curblock;
6029
6030	qid = make_kqid_projid(projid);
6031	dquot = dqget(sb, qid);
6032	if (IS_ERR(dquot))
6033		return PTR_ERR(dquot);
6034	spin_lock(&dquot->dq_dqb_lock);
6035
6036	limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
6037			     dquot->dq_dqb.dqb_bhardlimit);
6038	limit >>= sb->s_blocksize_bits;
6039
6040	if (limit && buf->f_blocks > limit) {
6041		curblock = (dquot->dq_dqb.dqb_curspace +
6042			    dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
6043		buf->f_blocks = limit;
6044		buf->f_bfree = buf->f_bavail =
6045			(buf->f_blocks > curblock) ?
6046			 (buf->f_blocks - curblock) : 0;
6047	}
6048
6049	limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
6050			     dquot->dq_dqb.dqb_ihardlimit);
6051	if (limit && buf->f_files > limit) {
6052		buf->f_files = limit;
6053		buf->f_ffree =
6054			(buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
6055			 (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
6056	}
6057
6058	spin_unlock(&dquot->dq_dqb_lock);
6059	dqput(dquot);
6060	return 0;
6061}
6062#endif
6063
6064static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
6065{
6066	struct super_block *sb = dentry->d_sb;
6067	struct ext4_sb_info *sbi = EXT4_SB(sb);
6068	struct ext4_super_block *es = sbi->s_es;
6069	ext4_fsblk_t overhead = 0, resv_blocks;
6070	s64 bfree;
6071	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
6072
6073	if (!test_opt(sb, MINIX_DF))
6074		overhead = sbi->s_overhead;
6075
6076	buf->f_type = EXT4_SUPER_MAGIC;
6077	buf->f_bsize = sb->s_blocksize;
6078	buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
6079	bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
6080		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
6081	/* prevent underflow in case that few free space is available */
6082	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
6083	buf->f_bavail = buf->f_bfree -
6084			(ext4_r_blocks_count(es) + resv_blocks);
6085	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
6086		buf->f_bavail = 0;
6087	buf->f_files = le32_to_cpu(es->s_inodes_count);
6088	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
6089	buf->f_namelen = EXT4_NAME_LEN;
6090	buf->f_fsid = uuid_to_fsid(es->s_uuid);
6091
6092#ifdef CONFIG_QUOTA
6093	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
6094	    sb_has_quota_limits_enabled(sb, PRJQUOTA))
6095		ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
6096#endif
6097	return 0;
6098}
6099
6100
6101#ifdef CONFIG_QUOTA
6102
6103/*
6104 * Helper functions so that transaction is started before we acquire dqio_sem
6105 * to keep correct lock ordering of transaction > dqio_sem
6106 */
/* Return the quota file inode registered for @dquot's quota type on its sb. */
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
}
6111
6112static int ext4_write_dquot(struct dquot *dquot)
6113{
6114	int ret, err;
6115	handle_t *handle;
6116	struct inode *inode;
6117
6118	inode = dquot_to_inode(dquot);
6119	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
6120				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
6121	if (IS_ERR(handle))
6122		return PTR_ERR(handle);
6123	ret = dquot_commit(dquot);
6124	err = ext4_journal_stop(handle);
6125	if (!ret)
6126		ret = err;
6127	return ret;
6128}
6129
6130static int ext4_acquire_dquot(struct dquot *dquot)
6131{
6132	int ret, err;
6133	handle_t *handle;
6134
6135	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
6136				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
6137	if (IS_ERR(handle))
6138		return PTR_ERR(handle);
6139	ret = dquot_acquire(dquot);
6140	err = ext4_journal_stop(handle);
6141	if (!ret)
6142		ret = err;
6143	return ret;
6144}
6145
6146static int ext4_release_dquot(struct dquot *dquot)
6147{
6148	int ret, err;
6149	handle_t *handle;
6150
6151	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
6152				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
6153	if (IS_ERR(handle)) {
6154		/* Release dquot anyway to avoid endless cycle in dqput() */
6155		dquot_release(dquot);
6156		return PTR_ERR(handle);
6157	}
6158	ret = dquot_release(dquot);
6159	err = ext4_journal_stop(handle);
6160	if (!ret)
6161		ret = err;
6162	return ret;
6163}
6164
6165static int ext4_mark_dquot_dirty(struct dquot *dquot)
6166{
6167	struct super_block *sb = dquot->dq_sb;
6168
6169	if (ext4_is_quota_journalled(sb)) {
6170		dquot_mark_dquot_dirty(dquot);
6171		return ext4_write_dquot(dquot);
6172	} else {
6173		return dquot_mark_dquot_dirty(dquot);
6174	}
6175}
6176
6177static int ext4_write_info(struct super_block *sb, int type)
6178{
6179	int ret, err;
6180	handle_t *handle;
6181
6182	/* Data block + inode block */
6183	handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
6184	if (IS_ERR(handle))
6185		return PTR_ERR(handle);
6186	ret = dquot_commit_info(sb, type);
6187	err = ext4_journal_stop(handle);
6188	if (!ret)
6189		ret = err;
6190	return ret;
6191}
6192
/*
 * Set the lockdep subclass of @inode's i_data_sem to @subclass.  Callers in
 * this file pass I_DATA_SEM_QUOTA when turning an inode into a quota file
 * and I_DATA_SEM_NORMAL when that is undone or has failed.
 */
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* The first argument of lockdep_set_subclass has to be
	 * *exactly* the same as the argument to init_rwsem() --- in
	 * this case, in init_once() --- or lockdep gets unhappy
	 * because the name of the lock is set using the
	 * stringification of the argument to init_rwsem().
	 *
	 * For that reason the local must keep the name "ei".
	 */
	(void) ei;	/* shut up clang warning if !CONFIG_LOCKDEP */
	lockdep_set_subclass(&ei->i_data_sem, subclass);
}
6206
6207/*
6208 * Standard function to be called on quota_on
6209 */
6210static int ext4_quota_on(struct super_block *sb, int type, int format_id,
6211			 const struct path *path)
6212{
6213	int err;
6214
6215	if (!test_opt(sb, QUOTA))
6216		return -EINVAL;
6217
6218	/* Quotafile not on the same filesystem? */
6219	if (path->dentry->d_sb != sb)
6220		return -EXDEV;
6221
6222	/* Quota already enabled for this file? */
6223	if (IS_NOQUOTA(d_inode(path->dentry)))
6224		return -EBUSY;
6225
6226	/* Journaling quota? */
6227	if (EXT4_SB(sb)->s_qf_names[type]) {
6228		/* Quotafile not in fs root? */
6229		if (path->dentry->d_parent != sb->s_root)
6230			ext4_msg(sb, KERN_WARNING,
6231				"Quota file not on filesystem root. "
6232				"Journaled quota will not work");
6233		sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
6234	} else {
6235		/*
6236		 * Clear the flag just in case mount options changed since
6237		 * last time.
6238		 */
6239		sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
6240	}
6241
6242	/*
6243	 * When we journal data on quota file, we have to flush journal to see
6244	 * all updates to the file when we bypass pagecache...
6245	 */
6246	if (EXT4_SB(sb)->s_journal &&
6247	    ext4_should_journal_data(d_inode(path->dentry))) {
6248		/*
6249		 * We don't need to lock updates but journal_flush() could
6250		 * otherwise be livelocked...
6251		 */
6252		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
6253		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
6254		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
6255		if (err)
6256			return err;
6257	}
6258
6259	lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
6260	err = dquot_quota_on(sb, type, format_id, path);
6261	if (err) {
6262		lockdep_set_quota_inode(path->dentry->d_inode,
6263					     I_DATA_SEM_NORMAL);
6264	} else {
6265		struct inode *inode = d_inode(path->dentry);
6266		handle_t *handle;
6267
6268		/*
6269		 * Set inode flags to prevent userspace from messing with quota
6270		 * files. If this fails, we return success anyway since quotas
6271		 * are already enabled and this is not a hard failure.
6272		 */
6273		inode_lock(inode);
6274		handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
6275		if (IS_ERR(handle))
6276			goto unlock_inode;
6277		EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
6278		inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
6279				S_NOATIME | S_IMMUTABLE);
6280		err = ext4_mark_inode_dirty(handle, inode);
6281		ext4_journal_stop(handle);
6282	unlock_inode:
6283		inode_unlock(inode);
6284	}
6285	return err;
6286}
6287
6288static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
6289			     unsigned int flags)
6290{
6291	int err;
6292	struct inode *qf_inode;
6293	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
6294		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
6295		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
6296		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
6297	};
6298
6299	BUG_ON(!ext4_has_feature_quota(sb));
6300
6301	if (!qf_inums[type])
6302		return -EPERM;
6303
6304	qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
6305	if (IS_ERR(qf_inode)) {
6306		ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
6307		return PTR_ERR(qf_inode);
6308	}
6309
6310	/* Don't account quota for quota files to avoid recursion */
6311	qf_inode->i_flags |= S_NOQUOTA;
6312	lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
6313	err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
6314	if (err)
6315		lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
6316	iput(qf_inode);
6317
6318	return err;
6319}
6320
6321/* Enable usage tracking for all quota types. */
6322int ext4_enable_quotas(struct super_block *sb)
6323{
6324	int type, err = 0;
6325	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
6326		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
6327		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
6328		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
6329	};
6330	bool quota_mopt[EXT4_MAXQUOTAS] = {
6331		test_opt(sb, USRQUOTA),
6332		test_opt(sb, GRPQUOTA),
6333		test_opt(sb, PRJQUOTA),
6334	};
6335
6336	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
6337	for (type = 0; type < EXT4_MAXQUOTAS; type++) {
6338		if (qf_inums[type]) {
6339			err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
6340				DQUOT_USAGE_ENABLED |
6341				(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
6342			if (err) {
6343				ext4_warning(sb,
6344					"Failed to enable quota tracking "
6345					"(type=%d, err=%d). Please run "
6346					"e2fsck to fix.", type, err);
6347				for (type--; type >= 0; type--)
6348					dquot_quota_off(sb, type);
6349
6350				return err;
6351			}
6352		}
6353	}
6354	return 0;
6355}
6356
6357static int ext4_quota_off(struct super_block *sb, int type)
6358{
6359	struct inode *inode = sb_dqopt(sb)->files[type];
6360	handle_t *handle;
6361	int err;
6362
6363	/* Force all delayed allocation blocks to be allocated.
6364	 * Caller already holds s_umount sem */
6365	if (test_opt(sb, DELALLOC))
6366		sync_filesystem(sb);
6367
6368	if (!inode || !igrab(inode))
6369		goto out;
6370
6371	err = dquot_quota_off(sb, type);
6372	if (err || ext4_has_feature_quota(sb))
6373		goto out_put;
6374
6375	inode_lock(inode);
6376	/*
6377	 * Update modification times of quota files when userspace can
6378	 * start looking at them. If we fail, we return success anyway since
6379	 * this is not a hard failure and quotas are already disabled.
6380	 */
6381	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
6382	if (IS_ERR(handle)) {
6383		err = PTR_ERR(handle);
6384		goto out_unlock;
6385	}
6386	EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
6387	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
6388	inode->i_mtime = inode->i_ctime = current_time(inode);
6389	err = ext4_mark_inode_dirty(handle, inode);
6390	ext4_journal_stop(handle);
6391out_unlock:
6392	inode_unlock(inode);
6393out_put:
6394	lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
6395	iput(inode);
6396	return err;
6397out:
6398	return dquot_quota_off(sb, type);
6399}
6400
6401/* Read data from quotafile - avoid pagecache and such because we cannot afford
6402 * acquiring the locks... As quota files are never truncated and quota code
6403 * itself serializes the operations (and no one else should touch the files)
6404 * we don't have to be afraid of races */
6405static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
6406			       size_t len, loff_t off)
6407{
6408	struct inode *inode = sb_dqopt(sb)->files[type];
6409	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
6410	int offset = off & (sb->s_blocksize - 1);
6411	int tocopy;
6412	size_t toread;
6413	struct buffer_head *bh;
6414	loff_t i_size = i_size_read(inode);
6415
6416	if (off > i_size)
6417		return 0;
6418	if (off+len > i_size)
6419		len = i_size-off;
6420	toread = len;
6421	while (toread > 0) {
6422		tocopy = sb->s_blocksize - offset < toread ?
6423				sb->s_blocksize - offset : toread;
6424		bh = ext4_bread(NULL, inode, blk, 0);
6425		if (IS_ERR(bh))
6426			return PTR_ERR(bh);
6427		if (!bh)	/* A hole? */
6428			memset(data, 0, tocopy);
6429		else
6430			memcpy(data, bh->b_data+offset, tocopy);
6431		brelse(bh);
6432		offset = 0;
6433		toread -= tocopy;
6434		data += tocopy;
6435		blk++;
6436	}
6437	return len;
6438}
6439
6440/* Write to quotafile (we know the transaction is already started and has
6441 * enough credits) */
6442static ssize_t ext4_quota_write(struct super_block *sb, int type,
6443				const char *data, size_t len, loff_t off)
6444{
6445	struct inode *inode = sb_dqopt(sb)->files[type];
6446	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
6447	int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
6448	int retries = 0;
6449	struct buffer_head *bh;
6450	handle_t *handle = journal_current_handle();
6451
6452	if (EXT4_SB(sb)->s_journal && !handle) {
6453		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
6454			" cancelled because transaction is not started",
6455			(unsigned long long)off, (unsigned long long)len);
6456		return -EIO;
6457	}
6458	/*
6459	 * Since we account only one data block in transaction credits,
6460	 * then it is impossible to cross a block boundary.
6461	 */
6462	if (sb->s_blocksize - offset < len) {
6463		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
6464			" cancelled because not block aligned",
6465			(unsigned long long)off, (unsigned long long)len);
6466		return -EIO;
6467	}
6468
6469	do {
6470		bh = ext4_bread(handle, inode, blk,
6471				EXT4_GET_BLOCKS_CREATE |
6472				EXT4_GET_BLOCKS_METADATA_NOFAIL);
6473	} while (PTR_ERR(bh) == -ENOSPC &&
6474		 ext4_should_retry_alloc(inode->i_sb, &retries));
6475	if (IS_ERR(bh))
6476		return PTR_ERR(bh);
6477	if (!bh)
6478		goto out;
6479	BUFFER_TRACE(bh, "get write access");
6480	err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
6481	if (err) {
6482		brelse(bh);
6483		return err;
6484	}
6485	lock_buffer(bh);
6486	memcpy(bh->b_data+offset, data, len);
6487	flush_dcache_page(bh->b_page);
6488	unlock_buffer(bh);
6489	err = ext4_handle_dirty_metadata(handle, NULL, bh);
6490	brelse(bh);
6491out:
6492	if (inode->i_size < off + len) {
6493		i_size_write(inode, off + len);
6494		EXT4_I(inode)->i_disksize = inode->i_size;
6495		err2 = ext4_mark_inode_dirty(handle, inode);
6496		if (unlikely(err2 && !err))
6497			err = err2;
6498	}
6499	return err ? err : len;
6500}
6501#endif
6502
/*
 * Mount callback for ext4_fs_type: hands the request to mount_bdev() with
 * ext4_fill_super as the superblock fill routine and returns its result.
 */
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
}
6508
6509#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
6510static inline void register_as_ext2(void)
6511{
6512	int err = register_filesystem(&ext2_fs_type);
6513	if (err)
6514		printk(KERN_WARNING
6515		       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
6516}
6517
/* Undo register_as_ext2(): unregister ext2_fs_type. */
static inline void unregister_as_ext2(void)
{
	unregister_filesystem(&ext2_fs_type);
}
6522
/*
 * Report whether @sb's feature set is acceptable for ext2 (1) or not (0).
 *
 * Unknown incompat features always fail.  Unknown ro_compat features fail
 * only when the superblock is not read-only.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext2_incompat_features(sb))
		return 0;

	/* ro_compat features only matter for a writable mount */
	if (sb_rdonly(sb))
		return 1;

	return !ext4_has_unknown_ext2_ro_compat_features(sb);
}
6533#else
/*
 * Stand-ins for the branch where the preprocessor condition above does not
 * hold: registering and unregistering are no-ops, and the ext2 feature check
 * always reports "not OK" (0).
 */
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
6537#endif
6538
6539static inline void register_as_ext3(void)
6540{
6541	int err = register_filesystem(&ext3_fs_type);
6542	if (err)
6543		printk(KERN_WARNING
6544		       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
6545}
6546
/* Undo register_as_ext3(): unregister ext3_fs_type. */
static inline void unregister_as_ext3(void)
{
	unregister_filesystem(&ext3_fs_type);
}
6551
/*
 * Report whether @sb's feature set is acceptable for ext3 (1) or not (0).
 *
 * Unknown incompat features or a missing journal feature always fail.
 * Unknown ro_compat features fail only when the superblock is not read-only.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext3_incompat_features(sb) ||
	    !ext4_has_feature_journal(sb))
		return 0;

	/* ro_compat features only matter for a writable mount */
	if (sb_rdonly(sb))
		return 1;

	return !ext4_has_unknown_ext3_ro_compat_features(sb);
}
6564
/*
 * Filesystem type descriptor for "ext4"; registered by ext4_init_fs() and
 * unregistered by ext4_exit_fs().
 */
static struct file_system_type ext4_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext4",
	.mount		= ext4_mount,		/* mount_bdev() + ext4_fill_super */
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");
6573
6574/* Shared across all ext4 file systems */
6575wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
6576
6577static int __init ext4_init_fs(void)
6578{
6579	int i, err;
6580
6581	ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
6582	ext4_li_info = NULL;
6583
6584	/* Build-time check for flags consistency */
6585	ext4_check_flag_values();
6586
6587	for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
6588		init_waitqueue_head(&ext4__ioend_wq[i]);
6589
6590	err = ext4_init_es();
6591	if (err)
6592		return err;
6593
6594	err = ext4_init_pending();
6595	if (err)
6596		goto out7;
6597
6598	err = ext4_init_post_read_processing();
6599	if (err)
6600		goto out6;
6601
6602	err = ext4_init_pageio();
6603	if (err)
6604		goto out5;
6605
6606	err = ext4_init_system_zone();
6607	if (err)
6608		goto out4;
6609
6610	err = ext4_init_sysfs();
6611	if (err)
6612		goto out3;
6613
6614	err = ext4_init_mballoc();
6615	if (err)
6616		goto out2;
6617	err = init_inodecache();
6618	if (err)
6619		goto out1;
6620
6621	err = ext4_fc_init_dentry_cache();
6622	if (err)
6623		goto out05;
6624
6625	register_as_ext3();
6626	register_as_ext2();
6627	err = register_filesystem(&ext4_fs_type);
6628	if (err)
6629		goto out;
6630
6631	return 0;
6632out:
6633	unregister_as_ext2();
6634	unregister_as_ext3();
6635out05:
6636	destroy_inodecache();
6637out1:
6638	ext4_exit_mballoc();
6639out2:
6640	ext4_exit_sysfs();
6641out3:
6642	ext4_exit_system_zone();
6643out4:
6644	ext4_exit_pageio();
6645out5:
6646	ext4_exit_post_read_processing();
6647out6:
6648	ext4_exit_pending();
6649out7:
6650	ext4_exit_es();
6651
6652	return err;
6653}
6654
/*
 * Module exit: stop the lazy-init thread, unregister the ext2/ext3/ext4
 * filesystem types, then tear down the subsystems set up by ext4_init_fs().
 *
 * NOTE(review): nothing here undoes ext4_fc_init_dentry_cache() from
 * ext4_init_fs(); confirm whether a matching teardown is required.
 */
static void __exit ext4_exit_fs(void)
{
	ext4_destroy_lazyinit_thread();
	/* Remove the filesystem types before freeing what they rely on */
	unregister_as_ext2();
	unregister_as_ext3();
	unregister_filesystem(&ext4_fs_type);
	destroy_inodecache();
	ext4_exit_mballoc();
	ext4_exit_sysfs();
	ext4_exit_system_zone();
	ext4_exit_pageio();
	ext4_exit_post_read_processing();
	ext4_exit_es();
	ext4_exit_pending();
}
6670
6671MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
6672MODULE_DESCRIPTION("Fourth Extended Filesystem");
6673MODULE_LICENSE("GPL");
6674MODULE_SOFTDEP("pre: crc32c");
6675module_init(ext4_init_fs)
6676module_exit(ext4_exit_fs)
Configure Feed

Configure Feed