fs/ext4/super.c at v3.18-rc2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / ext4 / super.c
at v3.18-rc2 5618 lines 162 kB view raw
   1/*
   2 *  linux/fs/ext4/super.c
   3 *
   4 * Copyright (C) 1992, 1993, 1994, 1995
   5 * Remy Card (card@masi.ibp.fr)
   6 * Laboratoire MASI - Institut Blaise Pascal
   7 * Universite Pierre et Marie Curie (Paris VI)
   8 *
   9 *  from
  10 *
  11 *  linux/fs/minix/inode.c
  12 *
  13 *  Copyright (C) 1991, 1992  Linus Torvalds
  14 *
  15 *  Big-endian to little-endian byte-swapping/bitmaps by
  16 *        David S. Miller (davem@caip.rutgers.edu), 1995
  17 */
  18
  19#include <linux/module.h>
  20#include <linux/string.h>
  21#include <linux/fs.h>
  22#include <linux/time.h>
  23#include <linux/vmalloc.h>
  24#include <linux/jbd2.h>
  25#include <linux/slab.h>
  26#include <linux/init.h>
  27#include <linux/blkdev.h>
  28#include <linux/parser.h>
  29#include <linux/buffer_head.h>
  30#include <linux/exportfs.h>
  31#include <linux/vfs.h>
  32#include <linux/random.h>
  33#include <linux/mount.h>
  34#include <linux/namei.h>
  35#include <linux/quotaops.h>
  36#include <linux/seq_file.h>
  37#include <linux/proc_fs.h>
  38#include <linux/ctype.h>
  39#include <linux/log2.h>
  40#include <linux/crc16.h>
  41#include <linux/cleancache.h>
  42#include <asm/uaccess.h>
  43
  44#include <linux/kthread.h>
  45#include <linux/freezer.h>
  46
  47#include "ext4.h"
  48#include "ext4_extents.h"	/* Needed for trace points definition */
  49#include "ext4_jbd2.h"
  50#include "xattr.h"
  51#include "acl.h"
  52#include "mballoc.h"
  53
  54#define CREATE_TRACE_POINTS
  55#include <trace/events/ext4.h>
  56
/*
 * File-scope state of the ext4 superblock code.  Within the part of the file
 * visible here only ext4_proc_root is used: ext4_put_super() passes it as the
 * parent directory when removing the per-filesystem proc entry.
 * NOTE(review): the remaining variables are presumably used by the sysfs,
 * lazy-init and mballoc setup code further down -- confirm there.
 */
   57static struct proc_dir_entry *ext4_proc_root;
   58static struct kset *ext4_kset;
   59static struct ext4_lazy_init *ext4_li_info;
   60static struct mutex ext4_li_mtx;
   61static struct ext4_features *ext4_feat;
   62static int ext4_mballoc_ready;
  63
  64static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
  65			     unsigned long journal_devnum);
  66static int ext4_show_options(struct seq_file *seq, struct dentry *root);
  67static int ext4_commit_super(struct super_block *sb, int sync);
  68static void ext4_mark_recovery_complete(struct super_block *sb,
  69					struct ext4_super_block *es);
  70static void ext4_clear_journal_err(struct super_block *sb,
  71				   struct ext4_super_block *es);
  72static int ext4_sync_fs(struct super_block *sb, int wait);
  73static int ext4_remount(struct super_block *sb, int *flags, char *data);
  74static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
  75static int ext4_unfreeze(struct super_block *sb);
  76static int ext4_freeze(struct super_block *sb);
  77static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
  78		       const char *dev_name, void *data);
  79static inline int ext2_feature_set_ok(struct super_block *sb);
  80static inline int ext3_feature_set_ok(struct super_block *sb);
  81static int ext4_feature_set_ok(struct super_block *sb, int readonly);
  82static void ext4_destroy_lazyinit_thread(void);
  83static void ext4_unregister_li_request(struct super_block *sb);
  84static void ext4_clear_request_list(void);
  85static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
  86
/*
 * When no native ext2 driver is configured (neither built-in nor modular) and
 * CONFIG_EXT4_USE_FOR_EXT23 is set, define an "ext2" filesystem type whose
 * mount operation is ext4_mount(), plus the matching module aliases.
 *
 * IS_EXT2_SB(sb) is true when the block device backing @sb is held by that
 * filesystem type; in every other configuration it is the constant 0.
 */
   87#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
   88static struct file_system_type ext2_fs_type = {
   89	.owner		= THIS_MODULE,
   90	.name		= "ext2",
   91	.mount		= ext4_mount,
   92	.kill_sb	= kill_block_super,
   93	.fs_flags	= FS_REQUIRES_DEV,
   94};
   95MODULE_ALIAS_FS("ext2");
   96MODULE_ALIAS("ext2");
   97#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
   98#else
   99#define IS_EXT2_SB(sb) (0)
  100#endif
 101
 102
/*
 * Same arrangement as for ext2: when no native ext3 driver is configured and
 * CONFIG_EXT4_USE_FOR_EXT23 is set, define an "ext3" filesystem type that is
 * mounted through ext4_mount(), plus the matching module aliases.
 *
 * IS_EXT3_SB(sb) is true when the block device backing @sb is held by that
 * filesystem type; in every other configuration it is the constant 0.
 */
  103#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
  104static struct file_system_type ext3_fs_type = {
  105	.owner		= THIS_MODULE,
  106	.name		= "ext3",
  107	.mount		= ext4_mount,
  108	.kill_sb	= kill_block_super,
  109	.fs_flags	= FS_REQUIRES_DEV,
  110};
  111MODULE_ALIAS_FS("ext3");
  112MODULE_ALIAS("ext3");
  113#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
  114#else
  115#define IS_EXT3_SB(sb) (0)
  116#endif
 117
 118static int ext4_verify_csum_type(struct super_block *sb,
 119				 struct ext4_super_block *es)
 120{
 121	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
 122					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
 123		return 1;
 124
 125	return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
 126}
 127
 128static __le32 ext4_superblock_csum(struct super_block *sb,
 129				   struct ext4_super_block *es)
 130{
 131	struct ext4_sb_info *sbi = EXT4_SB(sb);
 132	int offset = offsetof(struct ext4_super_block, s_checksum);
 133	__u32 csum;
 134
 135	csum = ext4_chksum(sbi, ~0, (char *)es, offset);
 136
 137	return cpu_to_le32(csum);
 138}
 139
 140static int ext4_superblock_csum_verify(struct super_block *sb,
 141				       struct ext4_super_block *es)
 142{
 143	if (!ext4_has_metadata_csum(sb))
 144		return 1;
 145
 146	return es->s_checksum == ext4_superblock_csum(sb, es);
 147}
 148
 149void ext4_superblock_csum_set(struct super_block *sb)
 150{
 151	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 152
 153	if (!ext4_has_metadata_csum(sb))
 154		return;
 155
 156	es->s_checksum = ext4_superblock_csum(sb, es);
 157}
 158
 159void *ext4_kvmalloc(size_t size, gfp_t flags)
 160{
 161	void *ret;
 162
 163	ret = kmalloc(size, flags | __GFP_NOWARN);
 164	if (!ret)
 165		ret = __vmalloc(size, flags, PAGE_KERNEL);
 166	return ret;
 167}
 168
 169void *ext4_kvzalloc(size_t size, gfp_t flags)
 170{
 171	void *ret;
 172
 173	ret = kzalloc(size, flags | __GFP_NOWARN);
 174	if (!ret)
 175		ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
 176	return ret;
 177}
 178
 179void ext4_kvfree(void *ptr)
 180{
 181	if (is_vmalloc_addr(ptr))
 182		vfree(ptr);
 183	else
 184		kfree(ptr);
 185
 186}
 187
 188ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 189			       struct ext4_group_desc *bg)
 190{
 191	return le32_to_cpu(bg->bg_block_bitmap_lo) |
 192		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 193		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 194}
 195
 196ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 197			       struct ext4_group_desc *bg)
 198{
 199	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 200		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 201		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 202}
 203
 204ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 205			      struct ext4_group_desc *bg)
 206{
 207	return le32_to_cpu(bg->bg_inode_table_lo) |
 208		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 209		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 210}
 211
 212__u32 ext4_free_group_clusters(struct super_block *sb,
 213			       struct ext4_group_desc *bg)
 214{
 215	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 216		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 217		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 218}
 219
 220__u32 ext4_free_inodes_count(struct super_block *sb,
 221			      struct ext4_group_desc *bg)
 222{
 223	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 224		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 225		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 226}
 227
 228__u32 ext4_used_dirs_count(struct super_block *sb,
 229			      struct ext4_group_desc *bg)
 230{
 231	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 232		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 233		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 234}
 235
 236__u32 ext4_itable_unused_count(struct super_block *sb,
 237			      struct ext4_group_desc *bg)
 238{
 239	return le16_to_cpu(bg->bg_itable_unused_lo) |
 240		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 241		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 242}
 243
 244void ext4_block_bitmap_set(struct super_block *sb,
 245			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 246{
 247	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
 248	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 249		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
 250}
 251
 252void ext4_inode_bitmap_set(struct super_block *sb,
 253			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 254{
 255	bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
 256	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 257		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
 258}
 259
 260void ext4_inode_table_set(struct super_block *sb,
 261			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
 262{
 263	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
 264	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 265		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 266}
 267
 268void ext4_free_group_clusters_set(struct super_block *sb,
 269				  struct ext4_group_desc *bg, __u32 count)
 270{
 271	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
 272	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 273		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
 274}
 275
 276void ext4_free_inodes_set(struct super_block *sb,
 277			  struct ext4_group_desc *bg, __u32 count)
 278{
 279	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
 280	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 281		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
 282}
 283
 284void ext4_used_dirs_set(struct super_block *sb,
 285			  struct ext4_group_desc *bg, __u32 count)
 286{
 287	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
 288	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 289		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
 290}
 291
 292void ext4_itable_unused_set(struct super_block *sb,
 293			  struct ext4_group_desc *bg, __u32 count)
 294{
 295	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
 296	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 297		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 298}
 299
 300
 301static void __save_error_info(struct super_block *sb, const char *func,
 302			    unsigned int line)
 303{
 304	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 305
 306	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 307	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
 308	es->s_last_error_time = cpu_to_le32(get_seconds());
 309	strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
 310	es->s_last_error_line = cpu_to_le32(line);
 311	if (!es->s_first_error_time) {
 312		es->s_first_error_time = es->s_last_error_time;
 313		strncpy(es->s_first_error_func, func,
 314			sizeof(es->s_first_error_func));
 315		es->s_first_error_line = cpu_to_le32(line);
 316		es->s_first_error_ino = es->s_last_error_ino;
 317		es->s_first_error_block = es->s_last_error_block;
 318	}
 319	/*
 320	 * Start the daily error reporting function if it hasn't been
 321	 * started already
 322	 */
 323	if (!es->s_error_count)
 324		mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
 325	le32_add_cpu(&es->s_error_count, 1);
 326}
 327
 328static void save_error_info(struct super_block *sb, const char *func,
 329			    unsigned int line)
 330{
 331	__save_error_info(sb, func, line);
 332	ext4_commit_super(sb, 1);
 333}
 334
 335/*
 336 * The del_gendisk() function uninitializes the disk-specific data
 337 * structures, including the bdi structure, without telling anyone
 338 * else.  Once this happens, any attempt to call mark_buffer_dirty()
 339 * (for example, by ext4_commit_super), will cause a kernel OOPS.
 340 * This is a kludge to prevent these oops until we can put in a proper
 341 * hook in del_gendisk() to inform the VFS and file system layers.
 342 */
 343static int block_device_ejected(struct super_block *sb)
 344{
 345	struct inode *bd_inode = sb->s_bdev->bd_inode;
 346	struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
 347
 348	return bdi->dev == NULL;
 349}
 350
 351static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 352{
 353	struct super_block		*sb = journal->j_private;
 354	struct ext4_sb_info		*sbi = EXT4_SB(sb);
 355	int				error = is_journal_aborted(journal);
 356	struct ext4_journal_cb_entry	*jce;
 357
 358	BUG_ON(txn->t_state == T_FINISHED);
 359	spin_lock(&sbi->s_md_lock);
 360	while (!list_empty(&txn->t_private_list)) {
 361		jce = list_entry(txn->t_private_list.next,
 362				 struct ext4_journal_cb_entry, jce_list);
 363		list_del_init(&jce->jce_list);
 364		spin_unlock(&sbi->s_md_lock);
 365		jce->jce_func(sb, jce, error);
 366		spin_lock(&sbi->s_md_lock);
 367	}
 368	spin_unlock(&sbi->s_md_lock);
 369}
 370
 371/* Deal with the reporting of failure conditions on a filesystem such as
 372 * inconsistencies detected or read IO failures.
 373 *
 374 * On ext2, we can store the error state of the filesystem in the
 375 * superblock.  That is not possible on ext4, because we may have other
 376 * write ordering constraints on the superblock which prevent us from
 377 * writing it out straight away; and given that the journal is about to
 378 * be aborted, we can't rely on the current, or future, transactions to
 379 * write out the superblock safely.
 380 *
 381 * We'll just use the jbd2_journal_abort() error code to record an error in
 382 * the journal instead.  On recovery, the journal will complain about
 383 * that error until we've noted it down and cleared it.
 384 */
 385
 386static void ext4_handle_error(struct super_block *sb)
 387{
 388	if (sb->s_flags & MS_RDONLY)
 389		return;
 390
 391	if (!test_opt(sb, ERRORS_CONT)) {
 392		journal_t *journal = EXT4_SB(sb)->s_journal;
 393
 394		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 395		if (journal)
 396			jbd2_journal_abort(journal, -EIO);
 397	}
 398	if (test_opt(sb, ERRORS_RO)) {
 399		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 400		/*
 401		 * Make sure updated value of ->s_mount_flags will be visible
 402		 * before ->s_flags update
 403		 */
 404		smp_wmb();
 405		sb->s_flags |= MS_RDONLY;
 406	}
 407	if (test_opt(sb, ERRORS_PANIC))
 408		panic("EXT4-fs (device %s): panic forced after error\n",
 409			sb->s_id);
 410}
 411
 412#define ext4_error_ratelimit(sb)					\
 413		___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),	\
 414			     "EXT4-fs error")
 415
 416void __ext4_error(struct super_block *sb, const char *function,
 417		  unsigned int line, const char *fmt, ...)
 418{
 419	struct va_format vaf;
 420	va_list args;
 421
 422	if (ext4_error_ratelimit(sb)) {
 423		va_start(args, fmt);
 424		vaf.fmt = fmt;
 425		vaf.va = &args;
 426		printk(KERN_CRIT
 427		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
 428		       sb->s_id, function, line, current->comm, &vaf);
 429		va_end(args);
 430	}
 431	save_error_info(sb, function, line);
 432	ext4_handle_error(sb);
 433}
 434
 435void __ext4_error_inode(struct inode *inode, const char *function,
 436			unsigned int line, ext4_fsblk_t block,
 437			const char *fmt, ...)
 438{
 439	va_list args;
 440	struct va_format vaf;
 441	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 442
 443	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 444	es->s_last_error_block = cpu_to_le64(block);
 445	if (ext4_error_ratelimit(inode->i_sb)) {
 446		va_start(args, fmt);
 447		vaf.fmt = fmt;
 448		vaf.va = &args;
 449		if (block)
 450			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 451			       "inode #%lu: block %llu: comm %s: %pV\n",
 452			       inode->i_sb->s_id, function, line, inode->i_ino,
 453			       block, current->comm, &vaf);
 454		else
 455			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 456			       "inode #%lu: comm %s: %pV\n",
 457			       inode->i_sb->s_id, function, line, inode->i_ino,
 458			       current->comm, &vaf);
 459		va_end(args);
 460	}
 461	save_error_info(inode->i_sb, function, line);
 462	ext4_handle_error(inode->i_sb);
 463}
 464
 465void __ext4_error_file(struct file *file, const char *function,
 466		       unsigned int line, ext4_fsblk_t block,
 467		       const char *fmt, ...)
 468{
 469	va_list args;
 470	struct va_format vaf;
 471	struct ext4_super_block *es;
 472	struct inode *inode = file_inode(file);
 473	char pathname[80], *path;
 474
 475	es = EXT4_SB(inode->i_sb)->s_es;
 476	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 477	if (ext4_error_ratelimit(inode->i_sb)) {
 478		path = d_path(&(file->f_path), pathname, sizeof(pathname));
 479		if (IS_ERR(path))
 480			path = "(unknown)";
 481		va_start(args, fmt);
 482		vaf.fmt = fmt;
 483		vaf.va = &args;
 484		if (block)
 485			printk(KERN_CRIT
 486			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 487			       "block %llu: comm %s: path %s: %pV\n",
 488			       inode->i_sb->s_id, function, line, inode->i_ino,
 489			       block, current->comm, path, &vaf);
 490		else
 491			printk(KERN_CRIT
 492			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 493			       "comm %s: path %s: %pV\n",
 494			       inode->i_sb->s_id, function, line, inode->i_ino,
 495			       current->comm, path, &vaf);
 496		va_end(args);
 497	}
 498	save_error_info(inode->i_sb, function, line);
 499	ext4_handle_error(inode->i_sb);
 500}
 501
 502const char *ext4_decode_error(struct super_block *sb, int errno,
 503			      char nbuf[16])
 504{
 505	char *errstr = NULL;
 506
 507	switch (errno) {
 508	case -EIO:
 509		errstr = "IO failure";
 510		break;
 511	case -ENOMEM:
 512		errstr = "Out of memory";
 513		break;
 514	case -EROFS:
 515		if (!sb || (EXT4_SB(sb)->s_journal &&
 516			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 517			errstr = "Journal has aborted";
 518		else
 519			errstr = "Readonly filesystem";
 520		break;
 521	default:
 522		/* If the caller passed in an extra buffer for unknown
 523		 * errors, textualise them now.  Else we just return
 524		 * NULL. */
 525		if (nbuf) {
 526			/* Check for truncated error codes... */
 527			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
 528				errstr = nbuf;
 529		}
 530		break;
 531	}
 532
 533	return errstr;
 534}
 535
 536/* __ext4_std_error decodes expected errors from journaling functions
 537 * automatically and invokes the appropriate error response.  */
 538
 539void __ext4_std_error(struct super_block *sb, const char *function,
 540		      unsigned int line, int errno)
 541{
 542	char nbuf[16];
 543	const char *errstr;
 544
 545	/* Special case: if the error is EROFS, and we're not already
 546	 * inside a transaction, then there's really no point in logging
 547	 * an error. */
 548	if (errno == -EROFS && journal_current_handle() == NULL &&
 549	    (sb->s_flags & MS_RDONLY))
 550		return;
 551
 552	if (ext4_error_ratelimit(sb)) {
 553		errstr = ext4_decode_error(sb, errno, nbuf);
 554		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
 555		       sb->s_id, function, line, errstr);
 556	}
 557
 558	save_error_info(sb, function, line);
 559	ext4_handle_error(sb);
 560}
 561
 562/*
 563 * ext4_abort is a much stronger failure handler than ext4_error.  The
 564 * abort function may be used to deal with unrecoverable failures such
 565 * as journal IO errors or ENOMEM at a critical moment in log management.
 566 *
 567 * We unconditionally force the filesystem into an ABORT|READONLY state,
 568 * unless the error response on the fs has been set to panic in which
 569 * case we take the easy way out and panic immediately.
 570 */
 571
 572void __ext4_abort(struct super_block *sb, const char *function,
 573		unsigned int line, const char *fmt, ...)
 574{
 575	va_list args;
 576
 577	save_error_info(sb, function, line);
 578	va_start(args, fmt);
 579	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
 580	       function, line);
 581	vprintk(fmt, args);
 582	printk("\n");
 583	va_end(args);
 584
 585	if ((sb->s_flags & MS_RDONLY) == 0) {
 586		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 587		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 588		/*
 589		 * Make sure updated value of ->s_mount_flags will be visible
 590		 * before ->s_flags update
 591		 */
 592		smp_wmb();
 593		sb->s_flags |= MS_RDONLY;
 594		if (EXT4_SB(sb)->s_journal)
 595			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 596		save_error_info(sb, function, line);
 597	}
 598	if (test_opt(sb, ERRORS_PANIC))
 599		panic("EXT4-fs panic from previous error\n");
 600}
 601
 602void __ext4_msg(struct super_block *sb,
 603		const char *prefix, const char *fmt, ...)
 604{
 605	struct va_format vaf;
 606	va_list args;
 607
 608	if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
 609		return;
 610
 611	va_start(args, fmt);
 612	vaf.fmt = fmt;
 613	vaf.va = &args;
 614	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
 615	va_end(args);
 616}
 617
 618void __ext4_warning(struct super_block *sb, const char *function,
 619		    unsigned int line, const char *fmt, ...)
 620{
 621	struct va_format vaf;
 622	va_list args;
 623
 624	if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
 625			  "EXT4-fs warning"))
 626		return;
 627
 628	va_start(args, fmt);
 629	vaf.fmt = fmt;
 630	vaf.va = &args;
 631	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
 632	       sb->s_id, function, line, &vaf);
 633	va_end(args);
 634}
 635
 636void __ext4_grp_locked_error(const char *function, unsigned int line,
 637			     struct super_block *sb, ext4_group_t grp,
 638			     unsigned long ino, ext4_fsblk_t block,
 639			     const char *fmt, ...)
 640__releases(bitlock)
 641__acquires(bitlock)
 642{
 643	struct va_format vaf;
 644	va_list args;
 645	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 646
 647	es->s_last_error_ino = cpu_to_le32(ino);
 648	es->s_last_error_block = cpu_to_le64(block);
 649	__save_error_info(sb, function, line);
 650
 651	if (ext4_error_ratelimit(sb)) {
 652		va_start(args, fmt);
 653		vaf.fmt = fmt;
 654		vaf.va = &args;
 655		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
 656		       sb->s_id, function, line, grp);
 657		if (ino)
 658			printk(KERN_CONT "inode %lu: ", ino);
 659		if (block)
 660			printk(KERN_CONT "block %llu:",
 661			       (unsigned long long) block);
 662		printk(KERN_CONT "%pV\n", &vaf);
 663		va_end(args);
 664	}
 665
 666	if (test_opt(sb, ERRORS_CONT)) {
 667		ext4_commit_super(sb, 0);
 668		return;
 669	}
 670
 671	ext4_unlock_group(sb, grp);
 672	ext4_handle_error(sb);
 673	/*
 674	 * We only get here in the ERRORS_RO case; relocking the group
 675	 * may be dangerous, but nothing bad will happen since the
 676	 * filesystem will have already been marked read/only and the
 677	 * journal has been aborted.  We return 1 as a hint to callers
 678	 * who might what to use the return value from
 679	 * ext4_grp_locked_error() to distinguish between the
 680	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
 681	 * aggressively from the ext4 function in question, with a
 682	 * more appropriate error code.
 683	 */
 684	ext4_lock_group(sb, grp);
 685	return;
 686}
 687
 688void ext4_update_dynamic_rev(struct super_block *sb)
 689{
 690	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 691
 692	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
 693		return;
 694
 695	ext4_warning(sb,
 696		     "updating to rev %d because of new feature flag, "
 697		     "running e2fsck is recommended",
 698		     EXT4_DYNAMIC_REV);
 699
 700	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
 701	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
 702	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
 703	/* leave es->s_feature_*compat flags alone */
 704	/* es->s_uuid will be set by e2fsck if empty */
 705
 706	/*
 707	 * The rest of the superblock fields should be zero, and if not it
 708	 * means they are likely already in use, so leave them alone.  We
 709	 * can leave it up to e2fsck to clean up any inconsistencies there.
 710	 */
 711}
 712
 713/*
 714 * Open the external journal device
 715 */
 716static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 717{
 718	struct block_device *bdev;
 719	char b[BDEVNAME_SIZE];
 720
 721	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 722	if (IS_ERR(bdev))
 723		goto fail;
 724	return bdev;
 725
 726fail:
 727	ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
 728			__bdevname(dev, b), PTR_ERR(bdev));
 729	return NULL;
 730}
 731
 732/*
 733 * Release the journal device
 734 */
 735static void ext4_blkdev_put(struct block_device *bdev)
 736{
 737	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 738}
 739
 740static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
 741{
 742	struct block_device *bdev;
 743	bdev = sbi->journal_bdev;
 744	if (bdev) {
 745		ext4_blkdev_put(bdev);
 746		sbi->journal_bdev = NULL;
 747	}
 748}
 749
 750static inline struct inode *orphan_list_entry(struct list_head *l)
 751{
 752	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
 753}
 754
 755static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 756{
 757	struct list_head *l;
 758
 759	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
 760		 le32_to_cpu(sbi->s_es->s_last_orphan));
 761
 762	printk(KERN_ERR "sb_info orphan list:\n");
 763	list_for_each(l, &sbi->s_orphan) {
 764		struct inode *inode = orphan_list_entry(l);
 765		printk(KERN_ERR "  "
 766		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
 767		       inode->i_sb->s_id, inode->i_ino, inode,
 768		       inode->i_mode, inode->i_nlink,
 769		       NEXT_ORPHAN(inode));
 770	}
 771}
 772
 773static void ext4_put_super(struct super_block *sb)
 774{
 775	struct ext4_sb_info *sbi = EXT4_SB(sb);
 776	struct ext4_super_block *es = sbi->s_es;
 777	int i, err;
 778
 779	ext4_unregister_li_request(sb);
 780	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 781
 782	flush_workqueue(sbi->rsv_conversion_wq);
 783	destroy_workqueue(sbi->rsv_conversion_wq);
 784
 785	if (sbi->s_journal) {
 786		err = jbd2_journal_destroy(sbi->s_journal);
 787		sbi->s_journal = NULL;
 788		if (err < 0)
 789			ext4_abort(sb, "Couldn't clean up the journal");
 790	}
 791
 792	ext4_es_unregister_shrinker(sbi);
 793	del_timer_sync(&sbi->s_err_report);
 794	ext4_release_system_zone(sb);
 795	ext4_mb_release(sb);
 796	ext4_ext_release(sb);
 797	ext4_xattr_put_super(sb);
 798
 799	if (!(sb->s_flags & MS_RDONLY)) {
 800		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 801		es->s_state = cpu_to_le16(sbi->s_mount_state);
 802	}
 803	if (!(sb->s_flags & MS_RDONLY))
 804		ext4_commit_super(sb, 1);
 805
 806	if (sbi->s_proc) {
 807		remove_proc_entry("options", sbi->s_proc);
 808		remove_proc_entry(sb->s_id, ext4_proc_root);
 809	}
 810	kobject_del(&sbi->s_kobj);
 811
 812	for (i = 0; i < sbi->s_gdb_count; i++)
 813		brelse(sbi->s_group_desc[i]);
 814	ext4_kvfree(sbi->s_group_desc);
 815	ext4_kvfree(sbi->s_flex_groups);
 816	percpu_counter_destroy(&sbi->s_freeclusters_counter);
 817	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 818	percpu_counter_destroy(&sbi->s_dirs_counter);
 819	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
 820	brelse(sbi->s_sbh);
 821#ifdef CONFIG_QUOTA
 822	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 823		kfree(sbi->s_qf_names[i]);
 824#endif
 825
 826	/* Debugging code just in case the in-memory inode orphan list
 827	 * isn't empty.  The on-disk one can be non-empty if we've
 828	 * detected an error and taken the fs readonly, but the
 829	 * in-memory list had better be clean by this point. */
 830	if (!list_empty(&sbi->s_orphan))
 831		dump_orphan_list(sb, sbi);
 832	J_ASSERT(list_empty(&sbi->s_orphan));
 833
 834	invalidate_bdev(sb->s_bdev);
 835	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
 836		/*
 837		 * Invalidate the journal device's buffers.  We don't want them
 838		 * floating about in memory - the physical journal device may
 839		 * hotswapped, and it breaks the `ro-after' testing code.
 840		 */
 841		sync_blockdev(sbi->journal_bdev);
 842		invalidate_bdev(sbi->journal_bdev);
 843		ext4_blkdev_remove(sbi);
 844	}
 845	if (sbi->s_mb_cache) {
 846		ext4_xattr_destroy_cache(sbi->s_mb_cache);
 847		sbi->s_mb_cache = NULL;
 848	}
 849	if (sbi->s_mmp_tsk)
 850		kthread_stop(sbi->s_mmp_tsk);
 851	sb->s_fs_info = NULL;
 852	/*
 853	 * Now that we are completely done shutting down the
 854	 * superblock, we need to actually destroy the kobject.
 855	 */
 856	kobject_put(&sbi->s_kobj);
 857	wait_for_completion(&sbi->s_kobj_unregister);
 858	if (sbi->s_chksum_driver)
 859		crypto_free_shash(sbi->s_chksum_driver);
 860	kfree(sbi->s_blockgroup_lock);
 861	kfree(sbi);
 862}
 863
 864static struct kmem_cache *ext4_inode_cachep;
 865
 866/*
 867 * Called inside transaction, so use GFP_NOFS
 868 */
 869static struct inode *ext4_alloc_inode(struct super_block *sb)
 870{
 871	struct ext4_inode_info *ei;
 872
 873	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 874	if (!ei)
 875		return NULL;
 876
 877	ei->vfs_inode.i_version = 1;
 878	spin_lock_init(&ei->i_raw_lock);
 879	INIT_LIST_HEAD(&ei->i_prealloc_list);
 880	spin_lock_init(&ei->i_prealloc_lock);
 881	ext4_es_init_tree(&ei->i_es_tree);
 882	rwlock_init(&ei->i_es_lock);
 883	INIT_LIST_HEAD(&ei->i_es_lru);
 884	ei->i_es_all_nr = 0;
 885	ei->i_es_lru_nr = 0;
 886	ei->i_touch_when = 0;
 887	ei->i_reserved_data_blocks = 0;
 888	ei->i_reserved_meta_blocks = 0;
 889	ei->i_allocated_meta_blocks = 0;
 890	ei->i_da_metadata_calc_len = 0;
 891	ei->i_da_metadata_calc_last_lblock = 0;
 892	spin_lock_init(&(ei->i_block_reservation_lock));
 893#ifdef CONFIG_QUOTA
 894	ei->i_reserved_quota = 0;
 895#endif
 896	ei->jinode = NULL;
 897	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
 898	spin_lock_init(&ei->i_completed_io_lock);
 899	ei->i_sync_tid = 0;
 900	ei->i_datasync_tid = 0;
 901	atomic_set(&ei->i_ioend_count, 0);
 902	atomic_set(&ei->i_unwritten, 0);
 903	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
 904
 905	return &ei->vfs_inode;
 906}
 907
 908static int ext4_drop_inode(struct inode *inode)
 909{
 910	int drop = generic_drop_inode(inode);
 911
 912	trace_ext4_drop_inode(inode, drop);
 913	return drop;
 914}
 915
 916static void ext4_i_callback(struct rcu_head *head)
 917{
 918	struct inode *inode = container_of(head, struct inode, i_rcu);
 919	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
 920}
 921
 922static void ext4_destroy_inode(struct inode *inode)
 923{
 924	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
 925		ext4_msg(inode->i_sb, KERN_ERR,
 926			 "Inode %lu (%p): orphan list check failed!",
 927			 inode->i_ino, EXT4_I(inode));
 928		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
 929				EXT4_I(inode), sizeof(struct ext4_inode_info),
 930				true);
 931		dump_stack();
 932	}
 933	call_rcu(&inode->i_rcu, ext4_i_callback);
 934}
 935
 936static void init_once(void *foo)
 937{
 938	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 939
 940	INIT_LIST_HEAD(&ei->i_orphan);
 941	init_rwsem(&ei->xattr_sem);
 942	init_rwsem(&ei->i_data_sem);
 943	inode_init_once(&ei->vfs_inode);
 944}
 945
 946static int __init init_inodecache(void)
 947{
 948	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
 949					     sizeof(struct ext4_inode_info),
 950					     0, (SLAB_RECLAIM_ACCOUNT|
 951						SLAB_MEM_SPREAD),
 952					     init_once);
 953	if (ext4_inode_cachep == NULL)
 954		return -ENOMEM;
 955	return 0;
 956}
 957
 958static void destroy_inodecache(void)
 959{
 960	/*
 961	 * Make sure all delayed rcu free inodes are flushed before we
 962	 * destroy cache.
 963	 */
 964	rcu_barrier();
 965	kmem_cache_destroy(ext4_inode_cachep);
 966}
 967
 968void ext4_clear_inode(struct inode *inode)
 969{
 970	invalidate_inode_buffers(inode);
 971	clear_inode(inode);
 972	dquot_drop(inode);
 973	ext4_discard_preallocations(inode);
 974	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
 975	ext4_es_lru_del(inode);
 976	if (EXT4_I(inode)->jinode) {
 977		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
 978					       EXT4_I(inode)->jinode);
 979		jbd2_free_inode(EXT4_I(inode)->jinode);
 980		EXT4_I(inode)->jinode = NULL;
 981	}
 982}
 983
 984static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 985					u64 ino, u32 generation)
 986{
 987	struct inode *inode;
 988
 989	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
 990		return ERR_PTR(-ESTALE);
 991	if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
 992		return ERR_PTR(-ESTALE);
 993
 994	/* iget isn't really right if the inode is currently unallocated!!
 995	 *
 996	 * ext4_read_inode will return a bad_inode if the inode had been
 997	 * deleted, so we should be safe.
 998	 *
 999	 * Currently we don't know the generation for parent directory, so
1000	 * a generation of 0 means "accept any"
1001	 */
1002	inode = ext4_iget_normal(sb, ino);
1003	if (IS_ERR(inode))
1004		return ERR_CAST(inode);
1005	if (generation && inode->i_generation != generation) {
1006		iput(inode);
1007		return ERR_PTR(-ESTALE);
1008	}
1009
1010	return inode;
1011}
1012
1013static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
1014					int fh_len, int fh_type)
1015{
1016	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1017				    ext4_nfs_get_inode);
1018}
1019
1020static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
1021					int fh_len, int fh_type)
1022{
1023	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1024				    ext4_nfs_get_inode);
1025}
1026
1027/*
1028 * Try to release metadata pages (indirect blocks, directories) which are
1029 * mapped via the block device.  Since these pages could have journal heads
1030 * which would prevent try_to_free_buffers() from freeing them, we must use
1031 * jbd2 layer's try_to_free_buffers() function to release them.
1032 */
1033static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
1034				 gfp_t wait)
1035{
1036	journal_t *journal = EXT4_SB(sb)->s_journal;
1037
1038	WARN_ON(PageChecked(page));
1039	if (!page_has_buffers(page))
1040		return 0;
1041	if (journal)
1042		return jbd2_journal_try_to_free_buffers(journal, page,
1043							wait & ~__GFP_WAIT);
1044	return try_to_free_buffers(page);
1045}
1046
#ifdef CONFIG_QUOTA
/* Human-readable name for a quota type: "user" for USRQUOTA, else "group". */
#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
/*
 * Token-paste the prefix @on onto USRJQUOTA or GRPJQUOTA, depending on
 * whether @t is USRQUOTA.
 */
#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))

/* Forward declarations for the quota callbacks wired into the tables below. */
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
			 struct path *path);
static int ext4_quota_on_sysfile(struct super_block *sb, int type,
				 int format_id);
static int ext4_quota_off(struct super_block *sb, int type);
static int ext4_quota_off_sysfile(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
			       size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
				const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
			     unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);

/*
 * dquot operations: ext4-specific handlers for writing, acquiring,
 * releasing and dirtying dquots; allocation and destruction use the
 * generic dquot_alloc()/dquot_destroy().
 */
static const struct dquot_operations ext4_quota_operations = {
	.get_reserved_space = ext4_get_reserved_space,
	.write_dquot	= ext4_write_dquot,
	.acquire_dquot	= ext4_acquire_dquot,
	.release_dquot	= ext4_release_dquot,
	.mark_dirty	= ext4_mark_dquot_dirty,
	.write_info	= ext4_write_info,
	.alloc_dquot	= dquot_alloc,
	.destroy_dquot	= dquot_destroy,
};

/*
 * quotactl operations that turn quota on via a path (.quota_on) and off
 * via ext4_quota_off(); everything else is the generic dquot_* helper.
 */
static const struct quotactl_ops ext4_qctl_operations = {
	.quota_on	= ext4_quota_on,
	.quota_off	= ext4_quota_off,
	.quota_sync	= dquot_quota_sync,
	.get_info	= dquot_get_dqinfo,
	.set_info	= dquot_set_dqinfo,
	.get_dqblk	= dquot_get_dqblk,
	.set_dqblk	= dquot_set_dqblk
};

/*
 * Variant of the table above that uses the *_sysfile on/off handlers
 * (.quota_on_meta instead of .quota_on).
 * NOTE(review): presumably selected when the QUOTA feature keeps quota in
 * hidden system files -- confirm at the site that installs s_qcop.
 */
static const struct quotactl_ops ext4_qctl_sysfile_operations = {
	.quota_on_meta	= ext4_quota_on_sysfile,
	.quota_off	= ext4_quota_off_sysfile,
	.quota_sync	= dquot_quota_sync,
	.get_info	= dquot_get_dqinfo,
	.set_info	= dquot_set_dqinfo,
	.get_dqblk	= dquot_get_dqblk,
	.set_dqblk	= dquot_set_dqblk
};
#endif
1102
/*
 * Superblock operations table for ext4.  The quota read/write hooks are
 * only present when the kernel is built with CONFIG_QUOTA.
 */
static const struct super_operations ext4_sops = {
	.alloc_inode	= ext4_alloc_inode,
	.destroy_inode	= ext4_destroy_inode,
	.write_inode	= ext4_write_inode,
	.dirty_inode	= ext4_dirty_inode,
	.drop_inode	= ext4_drop_inode,
	.evict_inode	= ext4_evict_inode,
	.put_super	= ext4_put_super,
	.sync_fs	= ext4_sync_fs,
	.freeze_fs	= ext4_freeze,
	.unfreeze_fs	= ext4_unfreeze,
	.statfs		= ext4_statfs,
	.remount_fs	= ext4_remount,
	.show_options	= ext4_show_options,
#ifdef CONFIG_QUOTA
	.quota_read	= ext4_quota_read,
	.quota_write	= ext4_quota_write,
#endif
	.bdev_try_to_free_page = bdev_try_to_free_page,
};
1123
/*
 * Export operations: file-handle decoding for a dentry and for its parent,
 * plus parent lookup.
 */
static const struct export_operations ext4_export_ops = {
	.fh_to_dentry = ext4_fh_to_dentry,
	.fh_to_parent = ext4_fh_to_parent,
	.get_parent = ext4_get_parent,
};
1129
/*
 * Token identifiers for mount options.  The tokens[] match table maps
 * option text to these values and ext4_mount_opts[] describes how each
 * one is applied.  Opt_err terminates both tables, and Opt_removed marks
 * options that are recognised but ignored with a warning.
 */
enum {
	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
	Opt_nouid32, Opt_debug, Opt_removed,
	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
	Opt_data_err_abort, Opt_data_err_ignore,
	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
	Opt_usrquota, Opt_grpquota, Opt_i_version,
	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
	Opt_inode_readahead_blks, Opt_journal_ioprio,
	Opt_dioread_nolock, Opt_dioread_lock,
	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
	Opt_max_dir_size_kb,
};
1151
/*
 * Pattern table passed to match_token() by parse_options().  Each entry
 * pairs an Opt_* token with the option text; "%u" and "%s" mark a numeric
 * or string argument.  Some tokens appear twice, once with and once
 * without an argument (e.g. "barrier=%u" and "barrier").
 *
 * NOTE(review): several patterns share a prefix (e.g. "usrjquota=" precedes
 * "usrjquota=%s"), so entry order presumably matters to match_token() --
 * confirm before reordering.
 */
static const match_table_t tokens = {
	{Opt_bsd_df, "bsddf"},
	{Opt_minix_df, "minixdf"},
	{Opt_grpid, "grpid"},
	{Opt_grpid, "bsdgroups"},
	{Opt_nogrpid, "nogrpid"},
	{Opt_nogrpid, "sysvgroups"},
	{Opt_resgid, "resgid=%u"},
	{Opt_resuid, "resuid=%u"},
	{Opt_sb, "sb=%u"},
	{Opt_err_cont, "errors=continue"},
	{Opt_err_panic, "errors=panic"},
	{Opt_err_ro, "errors=remount-ro"},
	{Opt_nouid32, "nouid32"},
	{Opt_debug, "debug"},
	{Opt_removed, "oldalloc"},
	{Opt_removed, "orlov"},
	{Opt_user_xattr, "user_xattr"},
	{Opt_nouser_xattr, "nouser_xattr"},
	{Opt_acl, "acl"},
	{Opt_noacl, "noacl"},
	{Opt_noload, "norecovery"},
	{Opt_noload, "noload"},
	{Opt_removed, "nobh"},
	{Opt_removed, "bh"},
	{Opt_commit, "commit=%u"},
	{Opt_min_batch_time, "min_batch_time=%u"},
	{Opt_max_batch_time, "max_batch_time=%u"},
	{Opt_journal_dev, "journal_dev=%u"},
	{Opt_journal_path, "journal_path=%s"},
	{Opt_journal_checksum, "journal_checksum"},
	{Opt_journal_async_commit, "journal_async_commit"},
	{Opt_abort, "abort"},
	{Opt_data_journal, "data=journal"},
	{Opt_data_ordered, "data=ordered"},
	{Opt_data_writeback, "data=writeback"},
	{Opt_data_err_abort, "data_err=abort"},
	{Opt_data_err_ignore, "data_err=ignore"},
	{Opt_offusrjquota, "usrjquota="},
	{Opt_usrjquota, "usrjquota=%s"},
	{Opt_offgrpjquota, "grpjquota="},
	{Opt_grpjquota, "grpjquota=%s"},
	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
	{Opt_grpquota, "grpquota"},
	{Opt_noquota, "noquota"},
	{Opt_quota, "quota"},
	{Opt_usrquota, "usrquota"},
	{Opt_barrier, "barrier=%u"},
	{Opt_barrier, "barrier"},
	{Opt_nobarrier, "nobarrier"},
	{Opt_i_version, "i_version"},
	{Opt_stripe, "stripe=%u"},
	{Opt_delalloc, "delalloc"},
	{Opt_nodelalloc, "nodelalloc"},
	{Opt_removed, "mblk_io_submit"},
	{Opt_removed, "nomblk_io_submit"},
	{Opt_block_validity, "block_validity"},
	{Opt_noblock_validity, "noblock_validity"},
	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
	{Opt_journal_ioprio, "journal_ioprio=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc"},
	{Opt_noauto_da_alloc, "noauto_da_alloc"},
	{Opt_dioread_nolock, "dioread_nolock"},
	{Opt_dioread_lock, "dioread_lock"},
	{Opt_discard, "discard"},
	{Opt_nodiscard, "nodiscard"},
	{Opt_init_itable, "init_itable=%u"},
	{Opt_init_itable, "init_itable"},
	{Opt_noinit_itable, "noinit_itable"},
	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
	{Opt_removed, "noreservation"}, /* mount option from ext2/3 */
	{Opt_removed, "journal=%u"},	/* mount option from ext2/3 */
	{Opt_err, NULL},	/* terminator */
};
1232
1233static ext4_fsblk_t get_sb_block(void **data)
1234{
1235	ext4_fsblk_t	sb_block;
1236	char		*options = (char *) *data;
1237
1238	if (!options || strncmp(options, "sb=", 3) != 0)
1239		return 1;	/* Default location */
1240
1241	options += 3;
1242	/* TODO: use simple_strtoll with >32bit ext4 */
1243	sb_block = simple_strtoul(options, &options, 0);
1244	if (*options && *options != ',') {
1245		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
1246		       (char *) *data);
1247		return 1;
1248	}
1249	if (*options == ',')
1250		options++;
1251	*data = (void *) options;
1252
1253	return sb_block;
1254}
1255
/* Default journal I/O priority: best-effort class, level 3. */
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
/*
 * Format string for deprecated mount options.  It takes two %s arguments:
 * the option text and the kernel release in which it is to be removed.
 * It is only ever read, so it is const.
 */
static const char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1259
1260#ifdef CONFIG_QUOTA
1261static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1262{
1263	struct ext4_sb_info *sbi = EXT4_SB(sb);
1264	char *qname;
1265	int ret = -1;
1266
1267	if (sb_any_quota_loaded(sb) &&
1268		!sbi->s_qf_names[qtype]) {
1269		ext4_msg(sb, KERN_ERR,
1270			"Cannot change journaled "
1271			"quota options when quota turned on");
1272		return -1;
1273	}
1274	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
1275		ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
1276			 "when QUOTA feature is enabled");
1277		return -1;
1278	}
1279	qname = match_strdup(args);
1280	if (!qname) {
1281		ext4_msg(sb, KERN_ERR,
1282			"Not enough memory for storing quotafile name");
1283		return -1;
1284	}
1285	if (sbi->s_qf_names[qtype]) {
1286		if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
1287			ret = 1;
1288		else
1289			ext4_msg(sb, KERN_ERR,
1290				 "%s quota file already specified",
1291				 QTYPE2NAME(qtype));
1292		goto errout;
1293	}
1294	if (strchr(qname, '/')) {
1295		ext4_msg(sb, KERN_ERR,
1296			"quotafile must be on filesystem root");
1297		goto errout;
1298	}
1299	sbi->s_qf_names[qtype] = qname;
1300	set_opt(sb, QUOTA);
1301	return 1;
1302errout:
1303	kfree(qname);
1304	return ret;
1305}
1306
1307static int clear_qf_name(struct super_block *sb, int qtype)
1308{
1309
1310	struct ext4_sb_info *sbi = EXT4_SB(sb);
1311
1312	if (sb_any_quota_loaded(sb) &&
1313		sbi->s_qf_names[qtype]) {
1314		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1315			" when quota turned on");
1316		return -1;
1317	}
1318	kfree(sbi->s_qf_names[qtype]);
1319	sbi->s_qf_names[qtype] = NULL;
1320	return 1;
1321}
1322#endif
1323
/*
 * Flag bits for the 'flags' member of struct mount_opts; they tell
 * handle_mount_opt() how to apply an option.
 */
#define MOPT_SET	0x0001	/* set mount_opt bits in s_mount_opt */
#define MOPT_CLEAR	0x0002	/* clear mount_opt bits in s_mount_opt */
#define MOPT_NOSUPPORT	0x0004	/* option not supported in this build */
#define MOPT_EXPLICIT	0x0008	/* also sets EXPLICIT_DELALLOC */
#define MOPT_CLEAR_ERR	0x0010	/* clear ERRORS_MASK before applying */
#define MOPT_GTE0	0x0020	/* numeric argument must be >= 0 */
#ifdef CONFIG_QUOTA
#define MOPT_Q		0
#define MOPT_QFMT	0x0040	/* mount_opt holds a journaled quota format */
#else
/* Without CONFIG_QUOTA every quota option is reported as unsupported. */
#define MOPT_Q		MOPT_NOSUPPORT
#define MOPT_QFMT	MOPT_NOSUPPORT
#endif
#define MOPT_DATAJ	0x0080	/* data= journalling mode option */
#define MOPT_NO_EXT2	0x0100	/* rejected when mounted as ext2 */
#define MOPT_NO_EXT3	0x0200	/* rejected when mounted as ext3 */
#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING	0x0400	/* argument is a string, skip match_int() */
1342
/*
 * Per-option behaviour table, searched linearly by token in
 * handle_mount_opt() and walked by _ext4_show_options().
 *
 *   token:     Opt_* identifier
 *   mount_opt: EXT4_MOUNT_* bits to set or clear, or a QFMT_* value for
 *              MOPT_QFMT entries; 0 when the option is handled specially
 *   flags:     MOPT_* bits describing how the option is applied
 *
 * The table is terminated by an Opt_err entry.
 */
static const struct mount_opts {
	int	token;
	int	mount_opt;
	int	flags;
} ext4_mount_opts[] = {
	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_SET},
	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
	 MOPT_EXT4_ONLY | MOPT_SET},
	/* journal_async_commit also turns on journal checksumming */
	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
				    EXT4_MOUNT_JOURNAL_CHECKSUM),
	 MOPT_EXT4_ONLY | MOPT_SET},
	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
	 MOPT_NO_EXT2 | MOPT_SET},
	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
	 MOPT_NO_EXT2 | MOPT_CLEAR},
	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
	/* the stored bit is the negated form, hence SET for "noauto" */
	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
	/* options below with mount_opt 0 are handled by dedicated code */
	{Opt_commit, 0, MOPT_GTE0},
	{Opt_max_batch_time, 0, MOPT_GTE0},
	{Opt_min_batch_time, 0, MOPT_GTE0},
	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
	{Opt_init_itable, 0, MOPT_GTE0},
	{Opt_stripe, 0, MOPT_GTE0},
	{Opt_resuid, 0, MOPT_GTE0},
	{Opt_resgid, 0, MOPT_GTE0},
	{Opt_journal_dev, 0, MOPT_GTE0},
	{Opt_journal_path, 0, MOPT_STRING},
	{Opt_journal_ioprio, 0, MOPT_GTE0},
	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
	 MOPT_NO_EXT2 | MOPT_DATAJ},
	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
	{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
	{Opt_acl, 0, MOPT_NOSUPPORT},
	{Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
		       EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
	{Opt_usrjquota, 0, MOPT_Q},
	{Opt_grpjquota, 0, MOPT_Q},
	{Opt_offusrjquota, 0, MOPT_Q},
	{Opt_offgrpjquota, 0, MOPT_Q},
	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
	{Opt_err, 0, 0}	/* terminator */
};
1425
1426static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1427			    substring_t *args, unsigned long *journal_devnum,
1428			    unsigned int *journal_ioprio, int is_remount)
1429{
1430	struct ext4_sb_info *sbi = EXT4_SB(sb);
1431	const struct mount_opts *m;
1432	kuid_t uid;
1433	kgid_t gid;
1434	int arg = 0;
1435
1436#ifdef CONFIG_QUOTA
1437	if (token == Opt_usrjquota)
1438		return set_qf_name(sb, USRQUOTA, &args[0]);
1439	else if (token == Opt_grpjquota)
1440		return set_qf_name(sb, GRPQUOTA, &args[0]);
1441	else if (token == Opt_offusrjquota)
1442		return clear_qf_name(sb, USRQUOTA);
1443	else if (token == Opt_offgrpjquota)
1444		return clear_qf_name(sb, GRPQUOTA);
1445#endif
1446	switch (token) {
1447	case Opt_noacl:
1448	case Opt_nouser_xattr:
1449		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
1450		break;
1451	case Opt_sb:
1452		return 1;	/* handled by get_sb_block() */
1453	case Opt_removed:
1454		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
1455		return 1;
1456	case Opt_abort:
1457		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1458		return 1;
1459	case Opt_i_version:
1460		sb->s_flags |= MS_I_VERSION;
1461		return 1;
1462	}
1463
1464	for (m = ext4_mount_opts; m->token != Opt_err; m++)
1465		if (token == m->token)
1466			break;
1467
1468	if (m->token == Opt_err) {
1469		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
1470			 "or missing value", opt);
1471		return -1;
1472	}
1473
1474	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
1475		ext4_msg(sb, KERN_ERR,
1476			 "Mount option \"%s\" incompatible with ext2", opt);
1477		return -1;
1478	}
1479	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
1480		ext4_msg(sb, KERN_ERR,
1481			 "Mount option \"%s\" incompatible with ext3", opt);
1482		return -1;
1483	}
1484
1485	if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
1486		return -1;
1487	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
1488		return -1;
1489	if (m->flags & MOPT_EXPLICIT)
1490		set_opt2(sb, EXPLICIT_DELALLOC);
1491	if (m->flags & MOPT_CLEAR_ERR)
1492		clear_opt(sb, ERRORS_MASK);
1493	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
1494		ext4_msg(sb, KERN_ERR, "Cannot change quota "
1495			 "options when quota turned on");
1496		return -1;
1497	}
1498
1499	if (m->flags & MOPT_NOSUPPORT) {
1500		ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
1501	} else if (token == Opt_commit) {
1502		if (arg == 0)
1503			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
1504		sbi->s_commit_interval = HZ * arg;
1505	} else if (token == Opt_max_batch_time) {
1506		sbi->s_max_batch_time = arg;
1507	} else if (token == Opt_min_batch_time) {
1508		sbi->s_min_batch_time = arg;
1509	} else if (token == Opt_inode_readahead_blks) {
1510		if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
1511			ext4_msg(sb, KERN_ERR,
1512				 "EXT4-fs: inode_readahead_blks must be "
1513				 "0 or a power of 2 smaller than 2^31");
1514			return -1;
1515		}
1516		sbi->s_inode_readahead_blks = arg;
1517	} else if (token == Opt_init_itable) {
1518		set_opt(sb, INIT_INODE_TABLE);
1519		if (!args->from)
1520			arg = EXT4_DEF_LI_WAIT_MULT;
1521		sbi->s_li_wait_mult = arg;
1522	} else if (token == Opt_max_dir_size_kb) {
1523		sbi->s_max_dir_size_kb = arg;
1524	} else if (token == Opt_stripe) {
1525		sbi->s_stripe = arg;
1526	} else if (token == Opt_resuid) {
1527		uid = make_kuid(current_user_ns(), arg);
1528		if (!uid_valid(uid)) {
1529			ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
1530			return -1;
1531		}
1532		sbi->s_resuid = uid;
1533	} else if (token == Opt_resgid) {
1534		gid = make_kgid(current_user_ns(), arg);
1535		if (!gid_valid(gid)) {
1536			ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
1537			return -1;
1538		}
1539		sbi->s_resgid = gid;
1540	} else if (token == Opt_journal_dev) {
1541		if (is_remount) {
1542			ext4_msg(sb, KERN_ERR,
1543				 "Cannot specify journal on remount");
1544			return -1;
1545		}
1546		*journal_devnum = arg;
1547	} else if (token == Opt_journal_path) {
1548		char *journal_path;
1549		struct inode *journal_inode;
1550		struct path path;
1551		int error;
1552
1553		if (is_remount) {
1554			ext4_msg(sb, KERN_ERR,
1555				 "Cannot specify journal on remount");
1556			return -1;
1557		}
1558		journal_path = match_strdup(&args[0]);
1559		if (!journal_path) {
1560			ext4_msg(sb, KERN_ERR, "error: could not dup "
1561				"journal device string");
1562			return -1;
1563		}
1564
1565		error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
1566		if (error) {
1567			ext4_msg(sb, KERN_ERR, "error: could not find "
1568				"journal device path: error %d", error);
1569			kfree(journal_path);
1570			return -1;
1571		}
1572
1573		journal_inode = path.dentry->d_inode;
1574		if (!S_ISBLK(journal_inode->i_mode)) {
1575			ext4_msg(sb, KERN_ERR, "error: journal path %s "
1576				"is not a block device", journal_path);
1577			path_put(&path);
1578			kfree(journal_path);
1579			return -1;
1580		}
1581
1582		*journal_devnum = new_encode_dev(journal_inode->i_rdev);
1583		path_put(&path);
1584		kfree(journal_path);
1585	} else if (token == Opt_journal_ioprio) {
1586		if (arg > 7) {
1587			ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
1588				 " (must be 0-7)");
1589			return -1;
1590		}
1591		*journal_ioprio =
1592			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
1593	} else if (m->flags & MOPT_DATAJ) {
1594		if (is_remount) {
1595			if (!sbi->s_journal)
1596				ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1597			else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
1598				ext4_msg(sb, KERN_ERR,
1599					 "Cannot change data mode on remount");
1600				return -1;
1601			}
1602		} else {
1603			clear_opt(sb, DATA_FLAGS);
1604			sbi->s_mount_opt |= m->mount_opt;
1605		}
1606#ifdef CONFIG_QUOTA
1607	} else if (m->flags & MOPT_QFMT) {
1608		if (sb_any_quota_loaded(sb) &&
1609		    sbi->s_jquota_fmt != m->mount_opt) {
1610			ext4_msg(sb, KERN_ERR, "Cannot change journaled "
1611				 "quota options when quota turned on");
1612			return -1;
1613		}
1614		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1615					       EXT4_FEATURE_RO_COMPAT_QUOTA)) {
1616			ext4_msg(sb, KERN_ERR,
1617				 "Cannot set journaled quota options "
1618				 "when QUOTA feature is enabled");
1619			return -1;
1620		}
1621		sbi->s_jquota_fmt = m->mount_opt;
1622#endif
1623	} else {
1624		if (!args->from)
1625			arg = 1;
1626		if (m->flags & MOPT_CLEAR)
1627			arg = !arg;
1628		else if (unlikely(!(m->flags & MOPT_SET))) {
1629			ext4_msg(sb, KERN_WARNING,
1630				 "buggy handling of option %s", opt);
1631			WARN_ON(1);
1632			return -1;
1633		}
1634		if (arg != 0)
1635			sbi->s_mount_opt |= m->mount_opt;
1636		else
1637			sbi->s_mount_opt &= ~m->mount_opt;
1638	}
1639	return 1;
1640}
1641
1642static int parse_options(char *options, struct super_block *sb,
1643			 unsigned long *journal_devnum,
1644			 unsigned int *journal_ioprio,
1645			 int is_remount)
1646{
1647	struct ext4_sb_info *sbi = EXT4_SB(sb);
1648	char *p;
1649	substring_t args[MAX_OPT_ARGS];
1650	int token;
1651
1652	if (!options)
1653		return 1;
1654
1655	while ((p = strsep(&options, ",")) != NULL) {
1656		if (!*p)
1657			continue;
1658		/*
1659		 * Initialize args struct so we know whether arg was
1660		 * found; some options take optional arguments.
1661		 */
1662		args[0].to = args[0].from = NULL;
1663		token = match_token(p, tokens, args);
1664		if (handle_mount_opt(sb, p, token, args, journal_devnum,
1665				     journal_ioprio, is_remount) < 0)
1666			return 0;
1667	}
1668#ifdef CONFIG_QUOTA
1669	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
1670	    (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
1671		ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
1672			 "feature is enabled");
1673		return 0;
1674	}
1675	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1676		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1677			clear_opt(sb, USRQUOTA);
1678
1679		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1680			clear_opt(sb, GRPQUOTA);
1681
1682		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1683			ext4_msg(sb, KERN_ERR, "old and new quota "
1684					"format mixing");
1685			return 0;
1686		}
1687
1688		if (!sbi->s_jquota_fmt) {
1689			ext4_msg(sb, KERN_ERR, "journaled quota format "
1690					"not specified");
1691			return 0;
1692		}
1693	}
1694#endif
1695	if (test_opt(sb, DIOREAD_NOLOCK)) {
1696		int blocksize =
1697			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
1698
1699		if (blocksize < PAGE_CACHE_SIZE) {
1700			ext4_msg(sb, KERN_ERR, "can't mount with "
1701				 "dioread_nolock if block size != PAGE_SIZE");
1702			return 0;
1703		}
1704	}
1705	return 1;
1706}
1707
/*
 * Emit the journaled quota mount options (jqfmt=, usrjquota=, grpjquota=)
 * that are currently in effect.  Does nothing without CONFIG_QUOTA.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
					   struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (sbi->s_jquota_fmt) {
		const char *fmtname;

		switch (sbi->s_jquota_fmt) {
		case QFMT_VFS_OLD:
			fmtname = "vfsold";
			break;
		case QFMT_VFS_V0:
			fmtname = "vfsv0";
			break;
		case QFMT_VFS_V1:
			fmtname = "vfsv1";
			break;
		default:
			/* Unknown format id: print an empty name. */
			fmtname = "";
			break;
		}
		seq_printf(seq, ",jqfmt=%s", fmtname);
	}

	if (sbi->s_qf_names[USRQUOTA] != NULL)
		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);

	if (sbi->s_qf_names[GRPQUOTA] != NULL)
		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
#endif
}
1738
1739static const char *token2str(int token)
1740{
1741	const struct match_token *t;
1742
1743	for (t = tokens; t->token != Opt_err; t++)
1744		if (t->token == token && !strchr(t->pattern, '='))
1745			break;
1746	return t->pattern;
1747}
1748
/*
 * Show an option if
 *  - it's set to a non-default value OR
 *  - if the per-sb default is different from the global default
 *
 * @seq:    seq_file receiving the output
 * @sb:     superblock whose options are printed
 * @nodefs: when non-zero, compare against an all-zero default mask, print
 *          most value options unconditionally and separate entries with
 *          '\n' instead of ','
 *
 * Always returns 0.
 */
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
			      int nodefs)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
	const struct mount_opts *m;
	char sep = nodefs ? '\n' : ',';

/* Both helpers prefix the text with the separator chosen above. */
#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)

	if (sbi->s_sb_block != 1)
		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

	/* Table-driven flag options (plain MOPT_SET / MOPT_CLEAR entries). */
	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
		int want_set = m->flags & MOPT_SET;
		/* errors= options are printed separately further down */
		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
		    (m->flags & MOPT_CLEAR_ERR))
			continue;
		if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
			continue; /* skip if same as the default */
		if ((want_set &&
		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
		    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
			continue; /* select Opt_noFoo vs Opt_Foo */
		SEQ_OPTS_PRINT("%s", token2str(m->token));
	}

	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
		SEQ_OPTS_PRINT("resuid=%u",
				from_kuid_munged(&init_user_ns, sbi->s_resuid));
	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
		SEQ_OPTS_PRINT("resgid=%u",
				from_kgid_munged(&init_user_ns, sbi->s_resgid));
	/* With nodefs, -1 matches no EXT4_ERRORS_* value, so errors= is shown. */
	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
		SEQ_OPTS_PUTS("errors=remount-ro");
	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
		SEQ_OPTS_PUTS("errors=continue");
	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
		SEQ_OPTS_PUTS("errors=panic");
	/* s_commit_interval is stored as HZ * seconds; print it in seconds. */
	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
	if (sb->s_flags & MS_I_VERSION)
		SEQ_OPTS_PUTS("i_version");
	if (nodefs || sbi->s_stripe)
		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
	/* data= is shown only when the data mode bits differ from the default. */
	if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
			SEQ_OPTS_PUTS("data=journal");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
			SEQ_OPTS_PUTS("data=ordered");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
			SEQ_OPTS_PUTS("data=writeback");
	}
	if (nodefs ||
	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
			       sbi->s_inode_readahead_blks);

	if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
	if (nodefs || sbi->s_max_dir_size_kb)
		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);

	ext4_show_quota_options(seq, sb);
	return 0;
}
1830
1831static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1832{
1833	return _ext4_show_options(seq, root->d_sb, 0);
1834}
1835
1836static int options_seq_show(struct seq_file *seq, void *offset)
1837{
1838	struct super_block *sb = seq->private;
1839	int rc;
1840
1841	seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
1842	rc = _ext4_show_options(seq, sb, 1);
1843	seq_puts(seq, "\n");
1844	return rc;
1845}
1846
1847static int options_open_fs(struct inode *inode, struct file *file)
1848{
1849	return single_open(file, options_seq_show, PDE_DATA(inode));
1850}
1851
/*
 * File operations for the options file: opened through options_open_fs(),
 * read and seek via the seq_file helpers, released with single_release().
 */
static const struct file_operations ext4_seq_options_fops = {
	.owner = THIS_MODULE,
	.open = options_open_fs,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
1859
1860static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1861			    int read_only)
1862{
1863	struct ext4_sb_info *sbi = EXT4_SB(sb);
1864	int res = 0;
1865
1866	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
1867		ext4_msg(sb, KERN_ERR, "revision level too high, "
1868			 "forcing read-only mode");
1869		res = MS_RDONLY;
1870	}
1871	if (read_only)
1872		goto done;
1873	if (!(sbi->s_mount_state & EXT4_VALID_FS))
1874		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
1875			 "running e2fsck is recommended");
1876	else if (sbi->s_mount_state & EXT4_ERROR_FS)
1877		ext4_msg(sb, KERN_WARNING,
1878			 "warning: mounting fs with errors, "
1879			 "running e2fsck is recommended");
1880	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1881		 le16_to_cpu(es->s_mnt_count) >=
1882		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1883		ext4_msg(sb, KERN_WARNING,
1884			 "warning: maximal mount count reached, "
1885			 "running e2fsck is recommended");
1886	else if (le32_to_cpu(es->s_checkinterval) &&
1887		(le32_to_cpu(es->s_lastcheck) +
1888			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1889		ext4_msg(sb, KERN_WARNING,
1890			 "warning: checktime reached, "
1891			 "running e2fsck is recommended");
1892	if (!sbi->s_journal)
1893		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1894	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1895		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1896	le16_add_cpu(&es->s_mnt_count, 1);
1897	es->s_mtime = cpu_to_le32(get_seconds());
1898	ext4_update_dynamic_rev(sb);
1899	if (sbi->s_journal)
1900		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1901
1902	ext4_commit_super(sb, 1);
1903done:
1904	if (test_opt(sb, DEBUG))
1905		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1906				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1907			sb->s_blocksize,
1908			sbi->s_groups_count,
1909			EXT4_BLOCKS_PER_GROUP(sb),
1910			EXT4_INODES_PER_GROUP(sb),
1911			sbi->s_mount_opt, sbi->s_mount_opt2);
1912
1913	cleancache_init_fs(sb);
1914	return res;
1915}
1916
1917int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
1918{
1919	struct ext4_sb_info *sbi = EXT4_SB(sb);
1920	struct flex_groups *new_groups;
1921	int size;
1922
1923	if (!sbi->s_log_groups_per_flex)
1924		return 0;
1925
1926	size = ext4_flex_group(sbi, ngroup - 1) + 1;
1927	if (size <= sbi->s_flex_groups_allocated)
1928		return 0;
1929
1930	size = roundup_pow_of_two(size * sizeof(struct flex_groups));
1931	new_groups = ext4_kvzalloc(size, GFP_KERNEL);
1932	if (!new_groups) {
1933		ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
1934			 size / (int) sizeof(struct flex_groups));
1935		return -ENOMEM;
1936	}
1937
1938	if (sbi->s_flex_groups) {
1939		memcpy(new_groups, sbi->s_flex_groups,
1940		       (sbi->s_flex_groups_allocated *
1941			sizeof(struct flex_groups)));
1942		ext4_kvfree(sbi->s_flex_groups);
1943	}
1944	sbi->s_flex_groups = new_groups;
1945	sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
1946	return 0;
1947}
1948
1949static int ext4_fill_flex_info(struct super_block *sb)
1950{
1951	struct ext4_sb_info *sbi = EXT4_SB(sb);
1952	struct ext4_group_desc *gdp = NULL;
1953	ext4_group_t flex_group;
1954	int i, err;
1955
1956	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1957	if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
1958		sbi->s_log_groups_per_flex = 0;
1959		return 1;
1960	}
1961
1962	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
1963	if (err)
1964		goto failed;
1965
1966	for (i = 0; i < sbi->s_groups_count; i++) {
1967		gdp = ext4_get_group_desc(sb, i, NULL);
1968
1969		flex_group = ext4_flex_group(sbi, i);
1970		atomic_add(ext4_free_inodes_count(sb, gdp),
1971			   &sbi->s_flex_groups[flex_group].free_inodes);
1972		atomic64_add(ext4_free_group_clusters(sb, gdp),
1973			     &sbi->s_flex_groups[flex_group].free_clusters);
1974		atomic_add(ext4_used_dirs_count(sb, gdp),
1975			   &sbi->s_flex_groups[flex_group].used_dirs);
1976	}
1977
1978	return 1;
1979failed:
1980	return 0;
1981}
1982
1983static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1984				   struct ext4_group_desc *gdp)
1985{
1986	int offset;
1987	__u16 crc = 0;
1988	__le32 le_group = cpu_to_le32(block_group);
1989
1990	if (ext4_has_metadata_csum(sbi->s_sb)) {
1991		/* Use new metadata_csum algorithm */
1992		__le16 save_csum;
1993		__u32 csum32;
1994
1995		save_csum = gdp->bg_checksum;
1996		gdp->bg_checksum = 0;
1997		csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
1998				     sizeof(le_group));
1999		csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
2000				     sbi->s_desc_size);
2001		gdp->bg_checksum = save_csum;
2002
2003		crc = csum32 & 0xFFFF;
2004		goto out;
2005	}
2006
2007	/* old crc16 code */
2008	if (!(sbi->s_es->s_feature_ro_compat &
2009	      cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
2010		return 0;
2011
2012	offset = offsetof(struct ext4_group_desc, bg_checksum);
2013
2014	crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
2015	crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
2016	crc = crc16(crc, (__u8 *)gdp, offset);
2017	offset += sizeof(gdp->bg_checksum); /* skip checksum */
2018	/* for checksum of struct ext4_group_desc do the rest...*/
2019	if ((sbi->s_es->s_feature_incompat &
2020	     cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
2021	    offset < le16_to_cpu(sbi->s_es->s_desc_size))
2022		crc = crc16(crc, (__u8 *)gdp + offset,
2023			    le16_to_cpu(sbi->s_es->s_desc_size) -
2024				offset);
2025
2026out:
2027	return cpu_to_le16(crc);
2028}
2029
2030int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
2031				struct ext4_group_desc *gdp)
2032{
2033	if (ext4_has_group_desc_csum(sb) &&
2034	    (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
2035						      block_group, gdp)))
2036		return 0;
2037
2038	return 1;
2039}
2040
2041void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
2042			      struct ext4_group_desc *gdp)
2043{
2044	if (!ext4_has_group_desc_csum(sb))
2045		return;
2046	gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
2047}
2048
2049/* Called at mount-time, super-block is locked */
2050static int ext4_check_descriptors(struct super_block *sb,
2051				  ext4_group_t *first_not_zeroed)
2052{
2053	struct ext4_sb_info *sbi = EXT4_SB(sb);
2054	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
2055	ext4_fsblk_t last_block;
2056	ext4_fsblk_t block_bitmap;
2057	ext4_fsblk_t inode_bitmap;
2058	ext4_fsblk_t inode_table;
2059	int flexbg_flag = 0;
2060	ext4_group_t i, grp = sbi->s_groups_count;
2061
2062	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2063		flexbg_flag = 1;
2064
2065	ext4_debug("Checking group descriptors");
2066
2067	for (i = 0; i < sbi->s_groups_count; i++) {
2068		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
2069
2070		if (i == sbi->s_groups_count - 1 || flexbg_flag)
2071			last_block = ext4_blocks_count(sbi->s_es) - 1;
2072		else
2073			last_block = first_block +
2074				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
2075
2076		if ((grp == sbi->s_groups_count) &&
2077		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2078			grp = i;
2079
2080		block_bitmap = ext4_block_bitmap(sb, gdp);
2081		if (block_bitmap < first_block || block_bitmap > last_block) {
2082			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2083			       "Block bitmap for group %u not in group "
2084			       "(block %llu)!", i, block_bitmap);
2085			return 0;
2086		}
2087		inode_bitmap = ext4_inode_bitmap(sb, gdp);
2088		if (inode_bitmap < first_block || inode_bitmap > last_block) {
2089			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2090			       "Inode bitmap for group %u not in group "
2091			       "(block %llu)!", i, inode_bitmap);
2092			return 0;
2093		}
2094		inode_table = ext4_inode_table(sb, gdp);
2095		if (inode_table < first_block ||
2096		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
2097			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2098			       "Inode table for group %u not in group "
2099			       "(block %llu)!", i, inode_table);
2100			return 0;
2101		}
2102		ext4_lock_group(sb, i);
2103		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
2104			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2105				 "Checksum for group %u failed (%u!=%u)",
2106				 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
2107				     gdp)), le16_to_cpu(gdp->bg_checksum));
2108			if (!(sb->s_flags & MS_RDONLY)) {
2109				ext4_unlock_group(sb, i);
2110				return 0;
2111			}
2112		}
2113		ext4_unlock_group(sb, i);
2114		if (!flexbg_flag)
2115			first_block += EXT4_BLOCKS_PER_GROUP(sb);
2116	}
2117	if (NULL != first_not_zeroed)
2118		*first_not_zeroed = grp;
2119	return 1;
2120}
2121
2122/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
2123 * the superblock) which were deleted from all directories, but held open by
2124 * a process at the time of a crash.  We walk the list and try to delete these
2125 * inodes at recovery time (only with a read-write filesystem).
2126 *
2127 * In order to keep the orphan inode chain consistent during traversal (in
2128 * case of crash during recovery), we link each inode into the superblock
2129 * orphan list_head and handle it the same way as an inode deletion during
2130 * normal operation (which journals the operations for us).
2131 *
2132 * We only do an iget() and an iput() on each inode, which is very safe if we
2133 * accidentally point at an in-use or already deleted inode.  The worst that
2134 * can happen in this case is that we get a "bit already cleared" message from
2135 * ext4_free_inode().  The only reason we would point at a wrong inode is if
2136 * e2fsck was run on this filesystem, and it must have already done the orphan
2137 * inode cleanup for us, so we can safely abort without any further action.
2138 */
2139static void ext4_orphan_cleanup(struct super_block *sb,
2140				struct ext4_super_block *es)
2141{
2142	unsigned int s_flags = sb->s_flags;
2143	int nr_orphans = 0, nr_truncates = 0;
2144#ifdef CONFIG_QUOTA
2145	int i;
2146#endif
2147	if (!es->s_last_orphan) {
2148		jbd_debug(4, "no orphan inodes to clean up\n");
2149		return;
2150	}
2151
2152	if (bdev_read_only(sb->s_bdev)) {
2153		ext4_msg(sb, KERN_ERR, "write access "
2154			"unavailable, skipping orphan cleanup");
2155		return;
2156	}
2157
2158	/* Check if feature set would not allow a r/w mount */
2159	if (!ext4_feature_set_ok(sb, 0)) {
2160		ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2161			 "unknown ROCOMPAT features");
2162		return;
2163	}
2164
2165	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2166		/* don't clear list on RO mount w/ errors */
2167		if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
2168			ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
2169				  "clearing orphan list.\n");
2170			es->s_last_orphan = 0;
2171		}
2172		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
2173		return;
2174	}
2175
2176	if (s_flags & MS_RDONLY) {
2177		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
2178		sb->s_flags &= ~MS_RDONLY;
2179	}
2180#ifdef CONFIG_QUOTA
2181	/* Needed for iput() to work correctly and not trash data */
2182	sb->s_flags |= MS_ACTIVE;
2183	/* Turn on quotas so that they are updated correctly */
2184	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2185		if (EXT4_SB(sb)->s_qf_names[i]) {
2186			int ret = ext4_quota_on_mount(sb, i);
2187			if (ret < 0)
2188				ext4_msg(sb, KERN_ERR,
2189					"Cannot turn on journaled "
2190					"quota: error %d", ret);
2191		}
2192	}
2193#endif
2194
2195	while (es->s_last_orphan) {
2196		struct inode *inode;
2197
2198		inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
2199		if (IS_ERR(inode)) {
2200			es->s_last_orphan = 0;
2201			break;
2202		}
2203
2204		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2205		dquot_initialize(inode);
2206		if (inode->i_nlink) {
2207			if (test_opt(sb, DEBUG))
2208				ext4_msg(sb, KERN_DEBUG,
2209					"%s: truncating inode %lu to %lld bytes",
2210					__func__, inode->i_ino, inode->i_size);
2211			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2212				  inode->i_ino, inode->i_size);
2213			mutex_lock(&inode->i_mutex);
2214			truncate_inode_pages(inode->i_mapping, inode->i_size);
2215			ext4_truncate(inode);
2216			mutex_unlock(&inode->i_mutex);
2217			nr_truncates++;
2218		} else {
2219			if (test_opt(sb, DEBUG))
2220				ext4_msg(sb, KERN_DEBUG,
2221					"%s: deleting unreferenced inode %lu",
2222					__func__, inode->i_ino);
2223			jbd_debug(2, "deleting unreferenced inode %lu\n",
2224				  inode->i_ino);
2225			nr_orphans++;
2226		}
2227		iput(inode);  /* The delete magic happens here! */
2228	}
2229
2230#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
2231
2232	if (nr_orphans)
2233		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
2234		       PLURAL(nr_orphans));
2235	if (nr_truncates)
2236		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
2237		       PLURAL(nr_truncates));
2238#ifdef CONFIG_QUOTA
2239	/* Turn quotas off */
2240	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2241		if (sb_dqopt(sb)->files[i])
2242			dquot_quota_off(sb, i);
2243	}
2244#endif
2245	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
2246}
2247
2248/*
2249 * Maximal extent format file size.
2250 * Resulting logical blkno at s_maxbytes must fit in our on-disk
2251 * extent format containers, within a sector_t, and within i_blocks
2252 * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
2253 * so that won't be a limiting factor.
2254 *
2255 * However there is other limiting factor. We do store extents in the form
2256 * of starting block and length, hence the resulting length of the extent
2257 * covering maximum file size must fit into on-disk format containers as
2258 * well. Given that length is always by 1 unit bigger than max unit (because
2259 * we count 0 as well) we have to lower the s_maxbytes by one fs block.
2260 *
2261 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
2262 */
2263static loff_t ext4_max_size(int blkbits, int has_huge_files)
2264{
2265	loff_t res;
2266	loff_t upper_limit = MAX_LFS_FILESIZE;
2267
2268	/* small i_blocks in vfs inode? */
2269	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
2270		/*
2271		 * CONFIG_LBDAF is not enabled implies the inode
2272		 * i_block represent total blocks in 512 bytes
2273		 * 32 == size of vfs inode i_blocks * 8
2274		 */
2275		upper_limit = (1LL << 32) - 1;
2276
2277		/* total blocks in file system block size */
2278		upper_limit >>= (blkbits - 9);
2279		upper_limit <<= blkbits;
2280	}
2281
2282	/*
2283	 * 32-bit extent-start container, ee_block. We lower the maxbytes
2284	 * by one fs block, so ee_len can cover the extent of maximum file
2285	 * size
2286	 */
2287	res = (1LL << 32) - 1;
2288	res <<= blkbits;
2289
2290	/* Sanity check against vm- & vfs- imposed limits */
2291	if (res > upper_limit)
2292		res = upper_limit;
2293
2294	return res;
2295}
2296
2297/*
2298 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
2299 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
2300 * We need to be 1 filesystem block less than the 2^48 sector limit.
2301 */
2302static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
2303{
2304	loff_t res = EXT4_NDIR_BLOCKS;
2305	int meta_blocks;
2306	loff_t upper_limit;
2307	/* This is calculated to be the largest file size for a dense, block
2308	 * mapped file such that the file's total number of 512-byte sectors,
2309	 * including data and all indirect blocks, does not exceed (2^48 - 1).
2310	 *
2311	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
2312	 * number of 512-byte sectors of the file.
2313	 */
2314
2315	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
2316		/*
2317		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
2318		 * the inode i_block field represents total file blocks in
2319		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
2320		 */
2321		upper_limit = (1LL << 32) - 1;
2322
2323		/* total blocks in file system block size */
2324		upper_limit >>= (bits - 9);
2325
2326	} else {
2327		/*
2328		 * We use 48 bit ext4_inode i_blocks
2329		 * With EXT4_HUGE_FILE_FL set the i_blocks
2330		 * represent total number of blocks in
2331		 * file system block size
2332		 */
2333		upper_limit = (1LL << 48) - 1;
2334
2335	}
2336
2337	/* indirect blocks */
2338	meta_blocks = 1;
2339	/* double indirect blocks */
2340	meta_blocks += 1 + (1LL << (bits-2));
2341	/* tripple indirect blocks */
2342	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
2343
2344	upper_limit -= meta_blocks;
2345	upper_limit <<= bits;
2346
2347	res += 1LL << (bits-2);
2348	res += 1LL << (2*(bits-2));
2349	res += 1LL << (3*(bits-2));
2350	res <<= bits;
2351	if (res > upper_limit)
2352		res = upper_limit;
2353
2354	if (res > MAX_LFS_FILESIZE)
2355		res = MAX_LFS_FILESIZE;
2356
2357	return res;
2358}
2359
2360static ext4_fsblk_t descriptor_loc(struct super_block *sb,
2361				   ext4_fsblk_t logical_sb_block, int nr)
2362{
2363	struct ext4_sb_info *sbi = EXT4_SB(sb);
2364	ext4_group_t bg, first_meta_bg;
2365	int has_super = 0;
2366
2367	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
2368
2369	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
2370	    nr < first_meta_bg)
2371		return logical_sb_block + nr + 1;
2372	bg = sbi->s_desc_per_block * nr;
2373	if (ext4_bg_has_super(sb, bg))
2374		has_super = 1;
2375
2376	/*
2377	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
2378	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
2379	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
2380	 * compensate.
2381	 */
2382	if (sb->s_blocksize == 1024 && nr == 0 &&
2383	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0)
2384		has_super++;
2385
2386	return (has_super + ext4_group_first_block_no(sb, bg));
2387}
2388
2389/**
2390 * ext4_get_stripe_size: Get the stripe size.
2391 * @sbi: In memory super block info
2392 *
2393 * If we have specified it via mount option, then
2394 * use the mount option value. If the value specified at mount time is
2395 * greater than the blocks per group use the super block value.
2396 * If the super block value is greater than blocks per group return 0.
2397 * Allocator needs it be less than blocks per group.
2398 *
2399 */
2400static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2401{
2402	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
2403	unsigned long stripe_width =
2404			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
2405	int ret;
2406
2407	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
2408		ret = sbi->s_stripe;
2409	else if (stripe_width <= sbi->s_blocks_per_group)
2410		ret = stripe_width;
2411	else if (stride <= sbi->s_blocks_per_group)
2412		ret = stride;
2413	else
2414		ret = 0;
2415
2416	/*
2417	 * If the stripe width is 1, this makes no sense and
2418	 * we set it to 0 to turn off stripe handling code.
2419	 */
2420	if (ret <= 1)
2421		ret = 0;
2422
2423	return ret;
2424}
2425
/* sysfs support */
2427
2428struct ext4_attr {
2429	struct attribute attr;
2430	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2431	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2432			 const char *, size_t);
2433	union {
2434		int offset;
2435		int deprecated_val;
2436	} u;
2437};
2438
2439static int parse_strtoull(const char *buf,
2440		unsigned long long max, unsigned long long *value)
2441{
2442	int ret;
2443
2444	ret = kstrtoull(skip_spaces(buf), 0, value);
2445	if (!ret && *value > max)
2446		ret = -EINVAL;
2447	return ret;
2448}
2449
2450static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2451					      struct ext4_sb_info *sbi,
2452					      char *buf)
2453{
2454	return snprintf(buf, PAGE_SIZE, "%llu\n",
2455		(s64) EXT4_C2B(sbi,
2456			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
2457}
2458
2459static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2460					 struct ext4_sb_info *sbi, char *buf)
2461{
2462	struct super_block *sb = sbi->s_buddy_cache->i_sb;
2463
2464	if (!sb->s_bdev->bd_part)
2465		return snprintf(buf, PAGE_SIZE, "0\n");
2466	return snprintf(buf, PAGE_SIZE, "%lu\n",
2467			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2468			 sbi->s_sectors_written_start) >> 1);
2469}
2470
2471static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2472					  struct ext4_sb_info *sbi, char *buf)
2473{
2474	struct super_block *sb = sbi->s_buddy_cache->i_sb;
2475
2476	if (!sb->s_bdev->bd_part)
2477		return snprintf(buf, PAGE_SIZE, "0\n");
2478	return snprintf(buf, PAGE_SIZE, "%llu\n",
2479			(unsigned long long)(sbi->s_kbytes_written +
2480			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2481			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2482}
2483
2484static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2485					  struct ext4_sb_info *sbi,
2486					  const char *buf, size_t count)
2487{
2488	unsigned long t;
2489	int ret;
2490
2491	ret = kstrtoul(skip_spaces(buf), 0, &t);
2492	if (ret)
2493		return ret;
2494
2495	if (t && (!is_power_of_2(t) || t > 0x40000000))
2496		return -EINVAL;
2497
2498	sbi->s_inode_readahead_blks = t;
2499	return count;
2500}
2501
2502static ssize_t sbi_ui_show(struct ext4_attr *a,
2503			   struct ext4_sb_info *sbi, char *buf)
2504{
2505	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2506
2507	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2508}
2509
2510static ssize_t sbi_ui_store(struct ext4_attr *a,
2511			    struct ext4_sb_info *sbi,
2512			    const char *buf, size_t count)
2513{
2514	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2515	unsigned long t;
2516	int ret;
2517
2518	ret = kstrtoul(skip_spaces(buf), 0, &t);
2519	if (ret)
2520		return ret;
2521	*ui = t;
2522	return count;
2523}
2524
2525static ssize_t es_ui_show(struct ext4_attr *a,
2526			   struct ext4_sb_info *sbi, char *buf)
2527{
2528
2529	unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
2530			   a->u.offset);
2531
2532	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2533}
2534
2535static ssize_t reserved_clusters_show(struct ext4_attr *a,
2536				  struct ext4_sb_info *sbi, char *buf)
2537{
2538	return snprintf(buf, PAGE_SIZE, "%llu\n",
2539		(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
2540}
2541
2542static ssize_t reserved_clusters_store(struct ext4_attr *a,
2543				   struct ext4_sb_info *sbi,
2544				   const char *buf, size_t count)
2545{
2546	unsigned long long val;
2547	int ret;
2548
2549	if (parse_strtoull(buf, -1ULL, &val))
2550		return -EINVAL;
2551	ret = ext4_reserve_clusters(sbi, val);
2552
2553	return ret ? ret : count;
2554}
2555
2556static ssize_t trigger_test_error(struct ext4_attr *a,
2557				  struct ext4_sb_info *sbi,
2558				  const char *buf, size_t count)
2559{
2560	int len = count;
2561
2562	if (!capable(CAP_SYS_ADMIN))
2563		return -EPERM;
2564
2565	if (len && buf[len-1] == '\n')
2566		len--;
2567
2568	if (len)
2569		ext4_error(sbi->s_sb, "%.*s", len, buf);
2570	return count;
2571}
2572
2573static ssize_t sbi_deprecated_show(struct ext4_attr *a,
2574				   struct ext4_sb_info *sbi, char *buf)
2575{
2576	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
2577}
2578
/*
 * Define ext4_attr_<_name> with the given mode and handlers; u.offset is
 * the byte offset of _elname inside struct ext4_sb_info.
 */
#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname)		\
static struct ext4_attr ext4_attr_##_name = {				\
	.attr	= { .name = __stringify(_name), .mode = _mode },	\
	.show	= _show,						\
	.store	= _store,						\
	.u	= {							\
		.offset = offsetof(struct ext4_sb_info, _elname),	\
	},								\
}

/* Same, but u.offset is relative to struct ext4_super_block. */
#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname)		\
static struct ext4_attr ext4_attr_##_name = {				\
	.attr	= { .name = __stringify(_name), .mode = _mode },	\
	.show	= _show,						\
	.store	= _store,						\
	.u	= {							\
		.offset = offsetof(struct ext4_super_block, _elname),	\
	},								\
}

/* Define ext4_attr_<name> without an offset or deprecated value. */
#define EXT4_ATTR(name, mode, show, store)				\
static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)

/* Shorthands: handler-less, read-only, and read-write attributes. */
#define EXT4_INFO_ATTR(name)	EXT4_ATTR(name, 0444, NULL, NULL)
#define EXT4_RO_ATTR(name)	EXT4_ATTR(name, 0444, name##_show, NULL)
#define EXT4_RW_ATTR(name)	EXT4_ATTR(name, 0644, name##_show, name##_store)

/* Read-only unsigned int taken from the on-disk superblock. */
#define EXT4_RO_ATTR_ES_UI(name, elname)				\
	EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
/* Read-write unsigned int taken from the ext4_sb_info. */
#define EXT4_RW_ATTR_SBI_UI(name, elname)				\
	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)

/* Entry for an attribute pointer array. */
#define ATTR_LIST(name) &ext4_attr_##name.attr

/* Read-only attribute that always reports the constant _val. */
#define EXT4_DEPRECATED_ATTR(_name, _val)				\
static struct ext4_attr ext4_attr_##_name = {				\
	.attr	= { .name = __stringify(_name), .mode = 0444 },		\
	.show	= sbi_deprecated_show,					\
	.u	= {							\
		.deprecated_val = _val,					\
	},								\
}
2620
2621EXT4_RO_ATTR(delayed_allocation_blocks);
2622EXT4_RO_ATTR(session_write_kbytes);
2623EXT4_RO_ATTR(lifetime_write_kbytes);
2624EXT4_RW_ATTR(reserved_clusters);
2625EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2626		 inode_readahead_blks_store, s_inode_readahead_blks);
2627EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
2628EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
2629EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
2630EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2631EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2632EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2633EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2634EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
2635EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
2636EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2637EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
2638EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
2639EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
2640EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
2641EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
2642EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
2643EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
2644EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
2645EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
2646
2647static struct attribute *ext4_attrs[] = {
2648	ATTR_LIST(delayed_allocation_blocks),
2649	ATTR_LIST(session_write_kbytes),
2650	ATTR_LIST(lifetime_write_kbytes),
2651	ATTR_LIST(reserved_clusters),
2652	ATTR_LIST(inode_readahead_blks),
2653	ATTR_LIST(inode_goal),
2654	ATTR_LIST(mb_stats),
2655	ATTR_LIST(mb_max_to_scan),
2656	ATTR_LIST(mb_min_to_scan),
2657	ATTR_LIST(mb_order2_req),
2658	ATTR_LIST(mb_stream_req),
2659	ATTR_LIST(mb_group_prealloc),
2660	ATTR_LIST(max_writeback_mb_bump),
2661	ATTR_LIST(extent_max_zeroout_kb),
2662	ATTR_LIST(trigger_fs_error),
2663	ATTR_LIST(err_ratelimit_interval_ms),
2664	ATTR_LIST(err_ratelimit_burst),
2665	ATTR_LIST(warning_ratelimit_interval_ms),
2666	ATTR_LIST(warning_ratelimit_burst),
2667	ATTR_LIST(msg_ratelimit_interval_ms),
2668	ATTR_LIST(msg_ratelimit_burst),
2669	ATTR_LIST(errors_count),
2670	ATTR_LIST(first_error_time),
2671	ATTR_LIST(last_error_time),
2672	NULL,
2673};
2674
2675/* Features this copy of ext4 supports */
2676EXT4_INFO_ATTR(lazy_itable_init);
2677EXT4_INFO_ATTR(batched_discard);
2678EXT4_INFO_ATTR(meta_bg_resize);
2679
2680static struct attribute *ext4_feat_attrs[] = {
2681	ATTR_LIST(lazy_itable_init),
2682	ATTR_LIST(batched_discard),
2683	ATTR_LIST(meta_bg_resize),
2684	NULL,
2685};
2686
2687static ssize_t ext4_attr_show(struct kobject *kobj,
2688			      struct attribute *attr, char *buf)
2689{
2690	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2691						s_kobj);
2692	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2693
2694	return a->show ? a->show(a, sbi, buf) : 0;
2695}
2696
2697static ssize_t ext4_attr_store(struct kobject *kobj,
2698			       struct attribute *attr,
2699			       const char *buf, size_t len)
2700{
2701	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2702						s_kobj);
2703	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2704
2705	return a->store ? a->store(a, sbi, buf, len) : 0;
2706}
2707
2708static void ext4_sb_release(struct kobject *kobj)
2709{
2710	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2711						s_kobj);
2712	complete(&sbi->s_kobj_unregister);
2713}
2714
2715static const struct sysfs_ops ext4_attr_ops = {
2716	.show	= ext4_attr_show,
2717	.store	= ext4_attr_store,
2718};
2719
2720static struct kobj_type ext4_ktype = {
2721	.default_attrs	= ext4_attrs,
2722	.sysfs_ops	= &ext4_attr_ops,
2723	.release	= ext4_sb_release,
2724};
2725
2726static void ext4_feat_release(struct kobject *kobj)
2727{
2728	complete(&ext4_feat->f_kobj_unregister);
2729}
2730
2731static ssize_t ext4_feat_show(struct kobject *kobj,
2732			      struct attribute *attr, char *buf)
2733{
2734	return snprintf(buf, PAGE_SIZE, "supported\n");
2735}
2736
2737/*
2738 * We can not use ext4_attr_show/store because it relies on the kobject
2739 * being embedded in the ext4_sb_info structure which is definitely not
2740 * true in this case.
2741 */
2742static const struct sysfs_ops ext4_feat_ops = {
2743	.show	= ext4_feat_show,
2744	.store	= NULL,
2745};
2746
2747static struct kobj_type ext4_feat_ktype = {
2748	.default_attrs	= ext4_feat_attrs,
2749	.sysfs_ops	= &ext4_feat_ops,
2750	.release	= ext4_feat_release,
2751};
2752
2753/*
2754 * Check whether this filesystem can be mounted based on
2755 * the features present and the RDONLY/RDWR mount requested.
2756 * Returns 1 if this filesystem can be mounted as requested,
2757 * 0 if it cannot be.
2758 */
2759static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2760{
2761	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
2762		ext4_msg(sb, KERN_ERR,
2763			"Couldn't mount because of "
2764			"unsupported optional features (%x)",
2765			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2766			~EXT4_FEATURE_INCOMPAT_SUPP));
2767		return 0;
2768	}
2769
2770	if (readonly)
2771		return 1;
2772
2773	/* Check that feature set is OK for a read-write mount */
2774	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
2775		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
2776			 "unsupported optional features (%x)",
2777			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2778				~EXT4_FEATURE_RO_COMPAT_SUPP));
2779		return 0;
2780	}
2781	/*
2782	 * Large file size enabled file system can only be mounted
2783	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
2784	 */
2785	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2786		if (sizeof(blkcnt_t) < sizeof(u64)) {
2787			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
2788				 "cannot be mounted RDWR without "
2789				 "CONFIG_LBDAF");
2790			return 0;
2791		}
2792	}
2793	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
2794	    !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2795		ext4_msg(sb, KERN_ERR,
2796			 "Can't support bigalloc feature without "
2797			 "extents feature\n");
2798		return 0;
2799	}
2800
2801#ifndef CONFIG_QUOTA
2802	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
2803	    !readonly) {
2804		ext4_msg(sb, KERN_ERR,
2805			 "Filesystem with quota feature cannot be mounted RDWR "
2806			 "without CONFIG_QUOTA");
2807		return 0;
2808	}
2809#endif  /* CONFIG_QUOTA */
2810	return 1;
2811}
2812
2813/*
2814 * This function is called once a day if we have errors logged
2815 * on the file system
2816 */
2817static void print_daily_error_info(unsigned long arg)
2818{
2819	struct super_block *sb = (struct super_block *) arg;
2820	struct ext4_sb_info *sbi;
2821	struct ext4_super_block *es;
2822
2823	sbi = EXT4_SB(sb);
2824	es = sbi->s_es;
2825
2826	if (es->s_error_count)
2827		/* fsck newer than v1.41.13 is needed to clean this condition. */
2828		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
2829			 le32_to_cpu(es->s_error_count));
2830	if (es->s_first_error_time) {
2831		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
2832		       sb->s_id, le32_to_cpu(es->s_first_error_time),
2833		       (int) sizeof(es->s_first_error_func),
2834		       es->s_first_error_func,
2835		       le32_to_cpu(es->s_first_error_line));
2836		if (es->s_first_error_ino)
2837			printk(": inode %u",
2838			       le32_to_cpu(es->s_first_error_ino));
2839		if (es->s_first_error_block)
2840			printk(": block %llu", (unsigned long long)
2841			       le64_to_cpu(es->s_first_error_block));
2842		printk("\n");
2843	}
2844	if (es->s_last_error_time) {
2845		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
2846		       sb->s_id, le32_to_cpu(es->s_last_error_time),
2847		       (int) sizeof(es->s_last_error_func),
2848		       es->s_last_error_func,
2849		       le32_to_cpu(es->s_last_error_line));
2850		if (es->s_last_error_ino)
2851			printk(": inode %u",
2852			       le32_to_cpu(es->s_last_error_ino));
2853		if (es->s_last_error_block)
2854			printk(": block %llu", (unsigned long long)
2855			       le64_to_cpu(es->s_last_error_block));
2856		printk("\n");
2857	}
2858	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
2859}
2860
2861/* Find next suitable group and run ext4_init_inode_table */
2862static int ext4_run_li_request(struct ext4_li_request *elr)
2863{
2864	struct ext4_group_desc *gdp = NULL;
2865	ext4_group_t group, ngroups;
2866	struct super_block *sb;
2867	unsigned long timeout = 0;
2868	int ret = 0;
2869
2870	sb = elr->lr_super;
2871	ngroups = EXT4_SB(sb)->s_groups_count;
2872
2873	sb_start_write(sb);
2874	for (group = elr->lr_next_group; group < ngroups; group++) {
2875		gdp = ext4_get_group_desc(sb, group, NULL);
2876		if (!gdp) {
2877			ret = 1;
2878			break;
2879		}
2880
2881		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2882			break;
2883	}
2884
2885	if (group >= ngroups)
2886		ret = 1;
2887
2888	if (!ret) {
2889		timeout = jiffies;
2890		ret = ext4_init_inode_table(sb, group,
2891					    elr->lr_timeout ? 0 : 1);
2892		if (elr->lr_timeout == 0) {
2893			timeout = (jiffies - timeout) *
2894				  elr->lr_sbi->s_li_wait_mult;
2895			elr->lr_timeout = timeout;
2896		}
2897		elr->lr_next_sched = jiffies + elr->lr_timeout;
2898		elr->lr_next_group = group + 1;
2899	}
2900	sb_end_write(sb);
2901
2902	return ret;
2903}
2904
2905/*
2906 * Remove lr_request from the list_request and free the
2907 * request structure. Should be called with li_list_mtx held
2908 */
2909static void ext4_remove_li_request(struct ext4_li_request *elr)
2910{
2911	struct ext4_sb_info *sbi;
2912
2913	if (!elr)
2914		return;
2915
2916	sbi = elr->lr_sbi;
2917
2918	list_del(&elr->lr_request);
2919	sbi->s_li_request = NULL;
2920	kfree(elr);
2921}
2922
2923static void ext4_unregister_li_request(struct super_block *sb)
2924{
2925	mutex_lock(&ext4_li_mtx);
2926	if (!ext4_li_info) {
2927		mutex_unlock(&ext4_li_mtx);
2928		return;
2929	}
2930
2931	mutex_lock(&ext4_li_info->li_list_mtx);
2932	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2933	mutex_unlock(&ext4_li_info->li_list_mtx);
2934	mutex_unlock(&ext4_li_mtx);
2935}
2936
2937static struct task_struct *ext4_lazyinit_task;
2938
2939/*
2940 * This is the function where ext4lazyinit thread lives. It walks
2941 * through the request list searching for next scheduled filesystem.
2942 * When such a fs is found, run the lazy initialization request
2943 * (ext4_rn_li_request) and keep track of the time spend in this
2944 * function. Based on that time we compute next schedule time of
2945 * the request. When walking through the list is complete, compute
2946 * next waking time and put itself into sleep.
2947 */
2948static int ext4_lazyinit_thread(void *arg)
2949{
2950	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2951	struct list_head *pos, *n;
2952	struct ext4_li_request *elr;
2953	unsigned long next_wakeup, cur;
2954
2955	BUG_ON(NULL == eli);
2956
2957cont_thread:
2958	while (true) {
2959		next_wakeup = MAX_JIFFY_OFFSET;
2960
2961		mutex_lock(&eli->li_list_mtx);
2962		if (list_empty(&eli->li_request_list)) {
2963			mutex_unlock(&eli->li_list_mtx);
2964			goto exit_thread;
2965		}
2966
2967		list_for_each_safe(pos, n, &eli->li_request_list) {
2968			elr = list_entry(pos, struct ext4_li_request,
2969					 lr_request);
2970
2971			if (time_after_eq(jiffies, elr->lr_next_sched)) {
2972				if (ext4_run_li_request(elr) != 0) {
2973					/* error, remove the lazy_init job */
2974					ext4_remove_li_request(elr);
2975					continue;
2976				}
2977			}
2978
2979			if (time_before(elr->lr_next_sched, next_wakeup))
2980				next_wakeup = elr->lr_next_sched;
2981		}
2982		mutex_unlock(&eli->li_list_mtx);
2983
2984		try_to_freeze();
2985
2986		cur = jiffies;
2987		if ((time_after_eq(cur, next_wakeup)) ||
2988		    (MAX_JIFFY_OFFSET == next_wakeup)) {
2989			cond_resched();
2990			continue;
2991		}
2992
2993		schedule_timeout_interruptible(next_wakeup - cur);
2994
2995		if (kthread_should_stop()) {
2996			ext4_clear_request_list();
2997			goto exit_thread;
2998		}
2999	}
3000
3001exit_thread:
3002	/*
3003	 * It looks like the request list is empty, but we need
3004	 * to check it under the li_list_mtx lock, to prevent any
3005	 * additions into it, and of course we should lock ext4_li_mtx
3006	 * to atomically free the list and ext4_li_info, because at
3007	 * this point another ext4 filesystem could be registering
3008	 * new one.
3009	 */
3010	mutex_lock(&ext4_li_mtx);
3011	mutex_lock(&eli->li_list_mtx);
3012	if (!list_empty(&eli->li_request_list)) {
3013		mutex_unlock(&eli->li_list_mtx);
3014		mutex_unlock(&ext4_li_mtx);
3015		goto cont_thread;
3016	}
3017	mutex_unlock(&eli->li_list_mtx);
3018	kfree(ext4_li_info);
3019	ext4_li_info = NULL;
3020	mutex_unlock(&ext4_li_mtx);
3021
3022	return 0;
3023}
3024
3025static void ext4_clear_request_list(void)
3026{
3027	struct list_head *pos, *n;
3028	struct ext4_li_request *elr;
3029
3030	mutex_lock(&ext4_li_info->li_list_mtx);
3031	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
3032		elr = list_entry(pos, struct ext4_li_request,
3033				 lr_request);
3034		ext4_remove_li_request(elr);
3035	}
3036	mutex_unlock(&ext4_li_info->li_list_mtx);
3037}
3038
3039static int ext4_run_lazyinit_thread(void)
3040{
3041	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
3042					 ext4_li_info, "ext4lazyinit");
3043	if (IS_ERR(ext4_lazyinit_task)) {
3044		int err = PTR_ERR(ext4_lazyinit_task);
3045		ext4_clear_request_list();
3046		kfree(ext4_li_info);
3047		ext4_li_info = NULL;
3048		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
3049				 "initialization thread\n",
3050				 err);
3051		return err;
3052	}
3053	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
3054	return 0;
3055}
3056
3057/*
3058 * Check whether it make sense to run itable init. thread or not.
3059 * If there is at least one uninitialized inode table, return
3060 * corresponding group number, else the loop goes through all
3061 * groups and return total number of groups.
3062 */
3063static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
3064{
3065	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
3066	struct ext4_group_desc *gdp = NULL;
3067
3068	for (group = 0; group < ngroups; group++) {
3069		gdp = ext4_get_group_desc(sb, group, NULL);
3070		if (!gdp)
3071			continue;
3072
3073		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3074			break;
3075	}
3076
3077	return group;
3078}
3079
3080static int ext4_li_info_new(void)
3081{
3082	struct ext4_lazy_init *eli = NULL;
3083
3084	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
3085	if (!eli)
3086		return -ENOMEM;
3087
3088	INIT_LIST_HEAD(&eli->li_request_list);
3089	mutex_init(&eli->li_list_mtx);
3090
3091	eli->li_state |= EXT4_LAZYINIT_QUIT;
3092
3093	ext4_li_info = eli;
3094
3095	return 0;
3096}
3097
3098static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
3099					    ext4_group_t start)
3100{
3101	struct ext4_sb_info *sbi = EXT4_SB(sb);
3102	struct ext4_li_request *elr;
3103
3104	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
3105	if (!elr)
3106		return NULL;
3107
3108	elr->lr_super = sb;
3109	elr->lr_sbi = sbi;
3110	elr->lr_next_group = start;
3111
3112	/*
3113	 * Randomize first schedule time of the request to
3114	 * spread the inode table initialization requests
3115	 * better.
3116	 */
3117	elr->lr_next_sched = jiffies + (prandom_u32() %
3118				(EXT4_DEF_LI_MAX_START_DELAY * HZ));
3119	return elr;
3120}
3121
3122int ext4_register_li_request(struct super_block *sb,
3123			     ext4_group_t first_not_zeroed)
3124{
3125	struct ext4_sb_info *sbi = EXT4_SB(sb);
3126	struct ext4_li_request *elr = NULL;
3127	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
3128	int ret = 0;
3129
3130	mutex_lock(&ext4_li_mtx);
3131	if (sbi->s_li_request != NULL) {
3132		/*
3133		 * Reset timeout so it can be computed again, because
3134		 * s_li_wait_mult might have changed.
3135		 */
3136		sbi->s_li_request->lr_timeout = 0;
3137		goto out;
3138	}
3139
3140	if (first_not_zeroed == ngroups ||
3141	    (sb->s_flags & MS_RDONLY) ||
3142	    !test_opt(sb, INIT_INODE_TABLE))
3143		goto out;
3144
3145	elr = ext4_li_request_new(sb, first_not_zeroed);
3146	if (!elr) {
3147		ret = -ENOMEM;
3148		goto out;
3149	}
3150
3151	if (NULL == ext4_li_info) {
3152		ret = ext4_li_info_new();
3153		if (ret)
3154			goto out;
3155	}
3156
3157	mutex_lock(&ext4_li_info->li_list_mtx);
3158	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
3159	mutex_unlock(&ext4_li_info->li_list_mtx);
3160
3161	sbi->s_li_request = elr;
3162	/*
3163	 * set elr to NULL here since it has been inserted to
3164	 * the request_list and the removal and free of it is
3165	 * handled by ext4_clear_request_list from now on.
3166	 */
3167	elr = NULL;
3168
3169	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
3170		ret = ext4_run_lazyinit_thread();
3171		if (ret)
3172			goto out;
3173	}
3174out:
3175	mutex_unlock(&ext4_li_mtx);
3176	if (ret)
3177		kfree(elr);
3178	return ret;
3179}
3180
3181/*
3182 * We do not need to lock anything since this is called on
3183 * module unload.
3184 */
3185static void ext4_destroy_lazyinit_thread(void)
3186{
3187	/*
3188	 * If thread exited earlier
3189	 * there's nothing to be done.
3190	 */
3191	if (!ext4_li_info || !ext4_lazyinit_task)
3192		return;
3193
3194	kthread_stop(ext4_lazyinit_task);
3195}
3196
3197static int set_journal_csum_feature_set(struct super_block *sb)
3198{
3199	int ret = 1;
3200	int compat, incompat;
3201	struct ext4_sb_info *sbi = EXT4_SB(sb);
3202
3203	if (ext4_has_metadata_csum(sb)) {
3204		/* journal checksum v3 */
3205		compat = 0;
3206		incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
3207	} else {
3208		/* journal checksum v1 */
3209		compat = JBD2_FEATURE_COMPAT_CHECKSUM;
3210		incompat = 0;
3211	}
3212
3213	jbd2_journal_clear_features(sbi->s_journal,
3214			JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3215			JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3216			JBD2_FEATURE_INCOMPAT_CSUM_V2);
3217	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
3218		ret = jbd2_journal_set_features(sbi->s_journal,
3219				compat, 0,
3220				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3221				incompat);
3222	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
3223		ret = jbd2_journal_set_features(sbi->s_journal,
3224				compat, 0,
3225				incompat);
3226		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3227				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3228	} else {
3229		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3230				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3231	}
3232
3233	return ret;
3234}
3235
3236/*
3237 * Note: calculating the overhead so we can be compatible with
3238 * historical BSD practice is quite difficult in the face of
3239 * clusters/bigalloc.  This is because multiple metadata blocks from
3240 * different block group can end up in the same allocation cluster.
3241 * Calculating the exact overhead in the face of clustered allocation
3242 * requires either O(all block bitmaps) in memory or O(number of block
3243 * groups**2) in time.  We will still calculate the superblock for
3244 * older file systems --- and if we come across with a bigalloc file
3245 * system with zero in s_overhead_clusters the estimate will be close to
3246 * correct especially for very large cluster sizes --- but for newer
3247 * file systems, it's better to calculate this figure once at mkfs
3248 * time, and store it in the superblock.  If the superblock value is
3249 * present (even for non-bigalloc file systems), we will use it.
3250 */
3251static int count_overhead(struct super_block *sb, ext4_group_t grp,
3252			  char *buf)
3253{
3254	struct ext4_sb_info	*sbi = EXT4_SB(sb);
3255	struct ext4_group_desc	*gdp;
3256	ext4_fsblk_t		first_block, last_block, b;
3257	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
3258	int			s, j, count = 0;
3259
3260	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
3261		return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
3262			sbi->s_itb_per_group + 2);
3263
3264	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
3265		(grp * EXT4_BLOCKS_PER_GROUP(sb));
3266	last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
3267	for (i = 0; i < ngroups; i++) {
3268		gdp = ext4_get_group_desc(sb, i, NULL);
3269		b = ext4_block_bitmap(sb, gdp);
3270		if (b >= first_block && b <= last_block) {
3271			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3272			count++;
3273		}
3274		b = ext4_inode_bitmap(sb, gdp);
3275		if (b >= first_block && b <= last_block) {
3276			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3277			count++;
3278		}
3279		b = ext4_inode_table(sb, gdp);
3280		if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
3281			for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
3282				int c = EXT4_B2C(sbi, b - first_block);
3283				ext4_set_bit(c, buf);
3284				count++;
3285			}
3286		if (i != grp)
3287			continue;
3288		s = 0;
3289		if (ext4_bg_has_super(sb, grp)) {
3290			ext4_set_bit(s++, buf);
3291			count++;
3292		}
3293		for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
3294			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
3295			count++;
3296		}
3297	}
3298	if (!count)
3299		return 0;
3300	return EXT4_CLUSTERS_PER_GROUP(sb) -
3301		ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
3302}
3303
3304/*
3305 * Compute the overhead and stash it in sbi->s_overhead
3306 */
3307int ext4_calculate_overhead(struct super_block *sb)
3308{
3309	struct ext4_sb_info *sbi = EXT4_SB(sb);
3310	struct ext4_super_block *es = sbi->s_es;
3311	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3312	ext4_fsblk_t overhead = 0;
3313	char *buf = (char *) get_zeroed_page(GFP_KERNEL);
3314
3315	if (!buf)
3316		return -ENOMEM;
3317
3318	/*
3319	 * Compute the overhead (FS structures).  This is constant
3320	 * for a given filesystem unless the number of block groups
3321	 * changes so we cache the previous value until it does.
3322	 */
3323
3324	/*
3325	 * All of the blocks before first_data_block are overhead
3326	 */
3327	overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
3328
3329	/*
3330	 * Add the overhead found in each block group
3331	 */
3332	for (i = 0; i < ngroups; i++) {
3333		int blks;
3334
3335		blks = count_overhead(sb, i, buf);
3336		overhead += blks;
3337		if (blks)
3338			memset(buf, 0, PAGE_SIZE);
3339		cond_resched();
3340	}
3341	/* Add the journal blocks as well */
3342	if (sbi->s_journal)
3343		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
3344
3345	sbi->s_overhead = overhead;
3346	smp_wmb();
3347	free_page((unsigned long) buf);
3348	return 0;
3349}
3350
3351
3352static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
3353{
3354	ext4_fsblk_t resv_clusters;
3355
3356	/*
3357	 * There's no need to reserve anything when we aren't using extents.
3358	 * The space estimates are exact, there are no unwritten extents,
3359	 * hole punching doesn't need new metadata... This is needed especially
3360	 * to keep ext2/3 backward compatibility.
3361	 */
3362	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
3363		return 0;
3364	/*
3365	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
3366	 * This should cover the situations where we can not afford to run
3367	 * out of space like for example punch hole, or converting
3368	 * unwritten extents in delalloc path. In most cases such
3369	 * allocation would require 1, or 2 blocks, higher numbers are
3370	 * very rare.
3371	 */
3372	resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
3373			EXT4_SB(sb)->s_cluster_bits;
3374
3375	do_div(resv_clusters, 50);
3376	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
3377
3378	return resv_clusters;
3379}
3380
3381
3382static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
3383{
3384	ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
3385				sbi->s_cluster_bits;
3386
3387	if (count >= clusters)
3388		return -EINVAL;
3389
3390	atomic64_set(&sbi->s_resv_clusters, count);
3391	return 0;
3392}
3393
3394static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3395{
3396	char *orig_data = kstrdup(data, GFP_KERNEL);
3397	struct buffer_head *bh;
3398	struct ext4_super_block *es = NULL;
3399	struct ext4_sb_info *sbi;
3400	ext4_fsblk_t block;
3401	ext4_fsblk_t sb_block = get_sb_block(&data);
3402	ext4_fsblk_t logical_sb_block;
3403	unsigned long offset = 0;
3404	unsigned long journal_devnum = 0;
3405	unsigned long def_mount_opts;
3406	struct inode *root;
3407	char *cp;
3408	const char *descr;
3409	int ret = -ENOMEM;
3410	int blocksize, clustersize;
3411	unsigned int db_count;
3412	unsigned int i;
3413	int needs_recovery, has_huge_files, has_bigalloc;
3414	__u64 blocks_count;
3415	int err = 0;
3416	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3417	ext4_group_t first_not_zeroed;
3418
3419	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
3420	if (!sbi)
3421		goto out_free_orig;
3422
3423	sbi->s_blockgroup_lock =
3424		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
3425	if (!sbi->s_blockgroup_lock) {
3426		kfree(sbi);
3427		goto out_free_orig;
3428	}
3429	sb->s_fs_info = sbi;
3430	sbi->s_sb = sb;
3431	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
3432	sbi->s_sb_block = sb_block;
3433	if (sb->s_bdev->bd_part)
3434		sbi->s_sectors_written_start =
3435			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
3436
3437	/* Cleanup superblock name */
3438	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
3439		*cp = '!';
3440
3441	/* -EINVAL is default */
3442	ret = -EINVAL;
3443	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
3444	if (!blocksize) {
3445		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
3446		goto out_fail;
3447	}
3448
3449	/*
3450	 * The ext4 superblock will not be buffer aligned for other than 1kB
3451	 * block sizes.  We need to calculate the offset from buffer start.
3452	 */
3453	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
3454		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3455		offset = do_div(logical_sb_block, blocksize);
3456	} else {
3457		logical_sb_block = sb_block;
3458	}
3459
3460	if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
3461		ext4_msg(sb, KERN_ERR, "unable to read superblock");
3462		goto out_fail;
3463	}
3464	/*
3465	 * Note: s_es must be initialized as soon as possible because
3466	 *       some ext4 macro-instructions depend on its value
3467	 */
3468	es = (struct ext4_super_block *) (bh->b_data + offset);
3469	sbi->s_es = es;
3470	sb->s_magic = le16_to_cpu(es->s_magic);
3471	if (sb->s_magic != EXT4_SUPER_MAGIC)
3472		goto cantfind_ext4;
3473	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
3474
3475	/* Warn if metadata_csum and gdt_csum are both set. */
3476	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3477				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
3478	    EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3479		ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are "
3480			     "redundant flags; please run fsck.");
3481
3482	/* Check for a known checksum algorithm */
3483	if (!ext4_verify_csum_type(sb, es)) {
3484		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3485			 "unknown checksum algorithm.");
3486		silent = 1;
3487		goto cantfind_ext4;
3488	}
3489
3490	/* Load the checksum driver */
3491	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3492				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3493		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
3494		if (IS_ERR(sbi->s_chksum_driver)) {
3495			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
3496			ret = PTR_ERR(sbi->s_chksum_driver);
3497			sbi->s_chksum_driver = NULL;
3498			goto failed_mount;
3499		}
3500	}
3501
3502	/* Check superblock checksum */
3503	if (!ext4_superblock_csum_verify(sb, es)) {
3504		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3505			 "invalid superblock checksum.  Run e2fsck?");
3506		silent = 1;
3507		goto cantfind_ext4;
3508	}
3509
3510	/* Precompute checksum seed for all metadata */
3511	if (ext4_has_metadata_csum(sb))
3512		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
3513					       sizeof(es->s_uuid));
3514
3515	/* Set defaults before we parse the mount options */
3516	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3517	set_opt(sb, INIT_INODE_TABLE);
3518	if (def_mount_opts & EXT4_DEFM_DEBUG)
3519		set_opt(sb, DEBUG);
3520	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
3521		set_opt(sb, GRPID);
3522	if (def_mount_opts & EXT4_DEFM_UID16)
3523		set_opt(sb, NO_UID32);
3524	/* xattr user namespace & acls are now defaulted on */
3525	set_opt(sb, XATTR_USER);
3526#ifdef CONFIG_EXT4_FS_POSIX_ACL
3527	set_opt(sb, POSIX_ACL);
3528#endif
3529	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3530		set_opt(sb, JOURNAL_DATA);
3531	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
3532		set_opt(sb, ORDERED_DATA);
3533	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
3534		set_opt(sb, WRITEBACK_DATA);
3535
3536	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
3537		set_opt(sb, ERRORS_PANIC);
3538	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
3539		set_opt(sb, ERRORS_CONT);
3540	else
3541		set_opt(sb, ERRORS_RO);
3542	/* block_validity enabled by default; disable with noblock_validity */
3543	set_opt(sb, BLOCK_VALIDITY);
3544	if (def_mount_opts & EXT4_DEFM_DISCARD)
3545		set_opt(sb, DISCARD);
3546
3547	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
3548	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
3549	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
3550	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
3551	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
3552
3553	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
3554		set_opt(sb, BARRIER);
3555
3556	/*
3557	 * enable delayed allocation by default
3558	 * Use -o nodelalloc to turn it off
3559	 */
3560	if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
3561	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3562		set_opt(sb, DELALLOC);
3563
3564	/*
3565	 * set default s_li_wait_mult for lazyinit, for the case there is
3566	 * no mount option specified.
3567	 */
3568	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3569
3570	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3571			   &journal_devnum, &journal_ioprio, 0)) {
3572		ext4_msg(sb, KERN_WARNING,
3573			 "failed to parse options in superblock: %s",
3574			 sbi->s_es->s_mount_opts);
3575	}
3576	sbi->s_def_mount_opt = sbi->s_mount_opt;
3577	if (!parse_options((char *) data, sb, &journal_devnum,
3578			   &journal_ioprio, 0))
3579		goto failed_mount;
3580
3581	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3582		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
3583			    "with data=journal disables delayed "
3584			    "allocation and O_DIRECT support!\n");
3585		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
3586			ext4_msg(sb, KERN_ERR, "can't mount with "
3587				 "both data=journal and delalloc");
3588			goto failed_mount;
3589		}
3590		if (test_opt(sb, DIOREAD_NOLOCK)) {
3591			ext4_msg(sb, KERN_ERR, "can't mount with "
3592				 "both data=journal and dioread_nolock");
3593			goto failed_mount;
3594		}
3595		if (test_opt(sb, DELALLOC))
3596			clear_opt(sb, DELALLOC);
3597	}
3598
3599	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3600		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3601
3602	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
3603	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
3604	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
3605	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
3606		ext4_msg(sb, KERN_WARNING,
3607		       "feature flags set on rev 0 fs, "
3608		       "running e2fsck is recommended");
3609
3610	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
3611		set_opt2(sb, HURD_COMPAT);
3612		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
3613					      EXT4_FEATURE_INCOMPAT_64BIT)) {
3614			ext4_msg(sb, KERN_ERR,
3615				 "The Hurd can't support 64-bit file systems");
3616			goto failed_mount;
3617		}
3618	}
3619
3620	if (IS_EXT2_SB(sb)) {
3621		if (ext2_feature_set_ok(sb))
3622			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3623				 "using the ext4 subsystem");
3624		else {
3625			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3626				 "to feature incompatibilities");
3627			goto failed_mount;
3628		}
3629	}
3630
3631	if (IS_EXT3_SB(sb)) {
3632		if (ext3_feature_set_ok(sb))
3633			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3634				 "using the ext4 subsystem");
3635		else {
3636			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3637				 "to feature incompatibilities");
3638			goto failed_mount;
3639		}
3640	}
3641
3642	/*
3643	 * Check feature flags regardless of the revision level, since we
3644	 * previously didn't change the revision level when setting the flags,
3645	 * so there is a chance incompat flags are set on a rev 0 filesystem.
3646	 */
3647	if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
3648		goto failed_mount;
3649
3650	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3651	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
3652	    blocksize > EXT4_MAX_BLOCK_SIZE) {
3653		ext4_msg(sb, KERN_ERR,
3654		       "Unsupported filesystem blocksize %d", blocksize);
3655		goto failed_mount;
3656	}
3657
3658	if (sb->s_blocksize != blocksize) {
3659		/* Validate the filesystem blocksize */
3660		if (!sb_set_blocksize(sb, blocksize)) {
3661			ext4_msg(sb, KERN_ERR, "bad block size %d",
3662					blocksize);
3663			goto failed_mount;
3664		}
3665
3666		brelse(bh);
3667		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3668		offset = do_div(logical_sb_block, blocksize);
3669		bh = sb_bread_unmovable(sb, logical_sb_block);
3670		if (!bh) {
3671			ext4_msg(sb, KERN_ERR,
3672			       "Can't read superblock on 2nd try");
3673			goto failed_mount;
3674		}
3675		es = (struct ext4_super_block *)(bh->b_data + offset);
3676		sbi->s_es = es;
3677		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
3678			ext4_msg(sb, KERN_ERR,
3679			       "Magic mismatch, very weird!");
3680			goto failed_mount;
3681		}
3682	}
3683
3684	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3685				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
3686	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
3687						      has_huge_files);
3688	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
3689
3690	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
3691		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
3692		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
3693	} else {
3694		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
3695		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
3696		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
3697		    (!is_power_of_2(sbi->s_inode_size)) ||
3698		    (sbi->s_inode_size > blocksize)) {
3699			ext4_msg(sb, KERN_ERR,
3700			       "unsupported inode size: %d",
3701			       sbi->s_inode_size);
3702			goto failed_mount;
3703		}
3704		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
3705			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
3706	}
3707
3708	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
3709	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
3710		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
3711		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
3712		    !is_power_of_2(sbi->s_desc_size)) {
3713			ext4_msg(sb, KERN_ERR,
3714			       "unsupported descriptor size %lu",
3715			       sbi->s_desc_size);
3716			goto failed_mount;
3717		}
3718	} else
3719		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
3720
3721	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
3722	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
3723	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
3724		goto cantfind_ext4;
3725
3726	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
3727	if (sbi->s_inodes_per_block == 0)
3728		goto cantfind_ext4;
3729	sbi->s_itb_per_group = sbi->s_inodes_per_group /
3730					sbi->s_inodes_per_block;
3731	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
3732	sbi->s_sbh = bh;
3733	sbi->s_mount_state = le16_to_cpu(es->s_state);
3734	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3735	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3736
3737	for (i = 0; i < 4; i++)
3738		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
3739	sbi->s_def_hash_version = es->s_def_hash_version;
3740	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
3741		i = le32_to_cpu(es->s_flags);
3742		if (i & EXT2_FLAGS_UNSIGNED_HASH)
3743			sbi->s_hash_unsigned = 3;
3744		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
3745#ifdef __CHAR_UNSIGNED__
3746			if (!(sb->s_flags & MS_RDONLY))
3747				es->s_flags |=
3748					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
3749			sbi->s_hash_unsigned = 3;
3750#else
3751			if (!(sb->s_flags & MS_RDONLY))
3752				es->s_flags |=
3753					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
3754#endif
3755		}
3756	}
3757
3758	/* Handle clustersize */
3759	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
3760	has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3761				EXT4_FEATURE_RO_COMPAT_BIGALLOC);
3762	if (has_bigalloc) {
3763		if (clustersize < blocksize) {
3764			ext4_msg(sb, KERN_ERR,
3765				 "cluster size (%d) smaller than "
3766				 "block size (%d)", clustersize, blocksize);
3767			goto failed_mount;
3768		}
3769		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
3770			le32_to_cpu(es->s_log_block_size);
3771		sbi->s_clusters_per_group =
3772			le32_to_cpu(es->s_clusters_per_group);
3773		if (sbi->s_clusters_per_group > blocksize * 8) {
3774			ext4_msg(sb, KERN_ERR,
3775				 "#clusters per group too big: %lu",
3776				 sbi->s_clusters_per_group);
3777			goto failed_mount;
3778		}
3779		if (sbi->s_blocks_per_group !=
3780		    (sbi->s_clusters_per_group * (clustersize / blocksize))) {
3781			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
3782				 "clusters per group (%lu) inconsistent",
3783				 sbi->s_blocks_per_group,
3784				 sbi->s_clusters_per_group);
3785			goto failed_mount;
3786		}
3787	} else {
3788		if (clustersize != blocksize) {
3789			ext4_warning(sb, "fragment/cluster size (%d) != "
3790				     "block size (%d)", clustersize,
3791				     blocksize);
3792			clustersize = blocksize;
3793		}
3794		if (sbi->s_blocks_per_group > blocksize * 8) {
3795			ext4_msg(sb, KERN_ERR,
3796				 "#blocks per group too big: %lu",
3797				 sbi->s_blocks_per_group);
3798			goto failed_mount;
3799		}
3800		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
3801		sbi->s_cluster_bits = 0;
3802	}
3803	sbi->s_cluster_ratio = clustersize / blocksize;
3804
3805	if (sbi->s_inodes_per_group > blocksize * 8) {
3806		ext4_msg(sb, KERN_ERR,
3807		       "#inodes per group too big: %lu",
3808		       sbi->s_inodes_per_group);
3809		goto failed_mount;
3810	}
3811
3812	/* Do we have standard group size of clustersize * 8 blocks ? */
3813	if (sbi->s_blocks_per_group == clustersize << 3)
3814		set_opt2(sb, STD_GROUP_SIZE);
3815
3816	/*
3817	 * Test whether we have more sectors than will fit in sector_t,
3818	 * and whether the max offset is addressable by the page cache.
3819	 */
3820	err = generic_check_addressable(sb->s_blocksize_bits,
3821					ext4_blocks_count(es));
3822	if (err) {
3823		ext4_msg(sb, KERN_ERR, "filesystem"
3824			 " too large to mount safely on this system");
3825		if (sizeof(sector_t) < 8)
3826			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
3827		goto failed_mount;
3828	}
3829
3830	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
3831		goto cantfind_ext4;
3832
3833	/* check blocks count against device size */
3834	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
3835	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
3836		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
3837		       "exceeds size of device (%llu blocks)",
3838		       ext4_blocks_count(es), blocks_count);
3839		goto failed_mount;
3840	}
3841
3842	/*
3843	 * It makes no sense for the first data block to be beyond the end
3844	 * of the filesystem.
3845	 */
3846	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
3847		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
3848			 "block %u is beyond end of filesystem (%llu)",
3849			 le32_to_cpu(es->s_first_data_block),
3850			 ext4_blocks_count(es));
3851		goto failed_mount;
3852	}
3853	blocks_count = (ext4_blocks_count(es) -
3854			le32_to_cpu(es->s_first_data_block) +
3855			EXT4_BLOCKS_PER_GROUP(sb) - 1);
3856	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
3857	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
3858		ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
3859		       "(block count %llu, first data block %u, "
3860		       "blocks per group %lu)", sbi->s_groups_count,
3861		       ext4_blocks_count(es),
3862		       le32_to_cpu(es->s_first_data_block),
3863		       EXT4_BLOCKS_PER_GROUP(sb));
3864		goto failed_mount;
3865	}
3866	sbi->s_groups_count = blocks_count;
3867	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
3868			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
3869	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
3870		   EXT4_DESC_PER_BLOCK(sb);
3871	sbi->s_group_desc = ext4_kvmalloc(db_count *
3872					  sizeof(struct buffer_head *),
3873					  GFP_KERNEL);
3874	if (sbi->s_group_desc == NULL) {
3875		ext4_msg(sb, KERN_ERR, "not enough memory");
3876		ret = -ENOMEM;
3877		goto failed_mount;
3878	}
3879
3880	if (ext4_proc_root)
3881		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3882
3883	if (sbi->s_proc)
3884		proc_create_data("options", S_IRUGO, sbi->s_proc,
3885				 &ext4_seq_options_fops, sb);
3886
3887	bgl_lock_init(sbi->s_blockgroup_lock);
3888
3889	for (i = 0; i < db_count; i++) {
3890		block = descriptor_loc(sb, logical_sb_block, i);
3891		sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
3892		if (!sbi->s_group_desc[i]) {
3893			ext4_msg(sb, KERN_ERR,
3894			       "can't read group descriptor %d", i);
3895			db_count = i;
3896			goto failed_mount2;
3897		}
3898	}
3899	if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
3900		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
3901		goto failed_mount2;
3902	}
3903
3904	sbi->s_gdb_count = db_count;
3905	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3906	spin_lock_init(&sbi->s_next_gen_lock);
3907
3908	init_timer(&sbi->s_err_report);
3909	sbi->s_err_report.function = print_daily_error_info;
3910	sbi->s_err_report.data = (unsigned long) sb;
3911
3912	/* Register extent status tree shrinker */
3913	if (ext4_es_register_shrinker(sbi))
3914		goto failed_mount3;
3915
3916	sbi->s_stripe = ext4_get_stripe_size(sbi);
3917	sbi->s_extent_max_zeroout_kb = 32;
3918
3919	/*
3920	 * set up enough so that it can read an inode
3921	 */
3922	sb->s_op = &ext4_sops;
3923	sb->s_export_op = &ext4_export_ops;
3924	sb->s_xattr = ext4_xattr_handlers;
3925#ifdef CONFIG_QUOTA
3926	sb->dq_op = &ext4_quota_operations;
3927	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
3928		sb->s_qcop = &ext4_qctl_sysfile_operations;
3929	else
3930		sb->s_qcop = &ext4_qctl_operations;
3931#endif
3932	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3933
3934	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3935	mutex_init(&sbi->s_orphan_lock);
3936
3937	sb->s_root = NULL;
3938
3939	needs_recovery = (es->s_last_orphan != 0 ||
3940			  EXT4_HAS_INCOMPAT_FEATURE(sb,
3941				    EXT4_FEATURE_INCOMPAT_RECOVER));
3942
3943	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3944	    !(sb->s_flags & MS_RDONLY))
3945		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3946			goto failed_mount3;
3947
3948	/*
3949	 * The first inode we look at is the journal inode.  Don't try
3950	 * root first: it may be modified in the journal!
3951	 */
3952	if (!test_opt(sb, NOLOAD) &&
3953	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
3954		if (ext4_load_journal(sb, es, journal_devnum))
3955			goto failed_mount3;
3956	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
3957	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
3958		ext4_msg(sb, KERN_ERR, "required journal recovery "
3959		       "suppressed and not mounted read-only");
3960		goto failed_mount_wq;
3961	} else {
3962		clear_opt(sb, DATA_FLAGS);
3963		sbi->s_journal = NULL;
3964		needs_recovery = 0;
3965		goto no_journal;
3966	}
3967
3968	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
3969	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
3970				       JBD2_FEATURE_INCOMPAT_64BIT)) {
3971		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
3972		goto failed_mount_wq;
3973	}
3974
3975	if (!set_journal_csum_feature_set(sb)) {
3976		ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
3977			 "feature set");
3978		goto failed_mount_wq;
3979	}
3980
3981	/* We have now updated the journal if required, so we can
3982	 * validate the data journaling mode. */
3983	switch (test_opt(sb, DATA_FLAGS)) {
3984	case 0:
3985		/* No mode set, assume a default based on the journal
3986		 * capabilities: ORDERED_DATA if the journal can
3987		 * cope, else JOURNAL_DATA
3988		 */
3989		if (jbd2_journal_check_available_features
3990		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
3991			set_opt(sb, ORDERED_DATA);
3992		else
3993			set_opt(sb, JOURNAL_DATA);
3994		break;
3995
3996	case EXT4_MOUNT_ORDERED_DATA:
3997	case EXT4_MOUNT_WRITEBACK_DATA:
3998		if (!jbd2_journal_check_available_features
3999		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4000			ext4_msg(sb, KERN_ERR, "Journal does not support "
4001			       "requested data journaling mode");
4002			goto failed_mount_wq;
4003		}
4004	default:
4005		break;
4006	}
4007	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4008
4009	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
4010
4011no_journal:
4012	if (ext4_mballoc_ready) {
4013		sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
4014		if (!sbi->s_mb_cache) {
4015			ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
4016			goto failed_mount_wq;
4017		}
4018	}
4019
4020	/*
4021	 * Get the # of file system overhead blocks from the
4022	 * superblock if present.
4023	 */
4024	if (es->s_overhead_clusters)
4025		sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
4026	else {
4027		err = ext4_calculate_overhead(sb);
4028		if (err)
4029			goto failed_mount_wq;
4030	}
4031
4032	/*
4033	 * The maximum number of concurrent works can be high and
4034	 * concurrency isn't really necessary.  Limit it to 1.
4035	 */
4036	EXT4_SB(sb)->rsv_conversion_wq =
4037		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
4038	if (!EXT4_SB(sb)->rsv_conversion_wq) {
4039		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
4040		ret = -ENOMEM;
4041		goto failed_mount4;
4042	}
4043
4044	/*
4045	 * The jbd2_journal_load will have done any necessary log recovery,
4046	 * so we can safely mount the rest of the filesystem now.
4047	 */
4048
4049	root = ext4_iget(sb, EXT4_ROOT_INO);
4050	if (IS_ERR(root)) {
4051		ext4_msg(sb, KERN_ERR, "get root inode failed");
4052		ret = PTR_ERR(root);
4053		root = NULL;
4054		goto failed_mount4;
4055	}
4056	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
4057		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
4058		iput(root);
4059		goto failed_mount4;
4060	}
4061	sb->s_root = d_make_root(root);
4062	if (!sb->s_root) {
4063		ext4_msg(sb, KERN_ERR, "get root dentry failed");
4064		ret = -ENOMEM;
4065		goto failed_mount4;
4066	}
4067
4068	if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
4069		sb->s_flags |= MS_RDONLY;
4070
4071	/* determine the minimum size of new large inodes, if present */
4072	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
4073		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
4074						     EXT4_GOOD_OLD_INODE_SIZE;
4075		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4076				       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
4077			if (sbi->s_want_extra_isize <
4078			    le16_to_cpu(es->s_want_extra_isize))
4079				sbi->s_want_extra_isize =
4080					le16_to_cpu(es->s_want_extra_isize);
4081			if (sbi->s_want_extra_isize <
4082			    le16_to_cpu(es->s_min_extra_isize))
4083				sbi->s_want_extra_isize =
4084					le16_to_cpu(es->s_min_extra_isize);
4085		}
4086	}
4087	/* Check if enough inode space is available */
4088	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
4089							sbi->s_inode_size) {
4090		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
4091						       EXT4_GOOD_OLD_INODE_SIZE;
4092		ext4_msg(sb, KERN_INFO, "required extra inode space not"
4093			 "available");
4094	}
4095
4096	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
4097	if (err) {
4098		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
4099			 "reserved pool", ext4_calculate_resv_clusters(sb));
4100		goto failed_mount4a;
4101	}
4102
4103	err = ext4_setup_system_zone(sb);
4104	if (err) {
4105		ext4_msg(sb, KERN_ERR, "failed to initialize system "
4106			 "zone (%d)", err);
4107		goto failed_mount4a;
4108	}
4109
4110	ext4_ext_init(sb);
4111	err = ext4_mb_init(sb);
4112	if (err) {
4113		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
4114			 err);
4115		goto failed_mount5;
4116	}
4117
4118	block = ext4_count_free_clusters(sb);
4119	ext4_free_blocks_count_set(sbi->s_es, 
4120				   EXT4_C2B(sbi, block));
4121	err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
4122				  GFP_KERNEL);
4123	if (!err) {
4124		unsigned long freei = ext4_count_free_inodes(sb);
4125		sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4126		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
4127					  GFP_KERNEL);
4128	}
4129	if (!err)
4130		err = percpu_counter_init(&sbi->s_dirs_counter,
4131					  ext4_count_dirs(sb), GFP_KERNEL);
4132	if (!err)
4133		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
4134					  GFP_KERNEL);
4135	if (err) {
4136		ext4_msg(sb, KERN_ERR, "insufficient memory");
4137		goto failed_mount6;
4138	}
4139
4140	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
4141		if (!ext4_fill_flex_info(sb)) {
4142			ext4_msg(sb, KERN_ERR,
4143			       "unable to initialize "
4144			       "flex_bg meta info!");
4145			goto failed_mount6;
4146		}
4147
4148	err = ext4_register_li_request(sb, first_not_zeroed);
4149	if (err)
4150		goto failed_mount6;
4151
4152	sbi->s_kobj.kset = ext4_kset;
4153	init_completion(&sbi->s_kobj_unregister);
4154	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
4155				   "%s", sb->s_id);
4156	if (err)
4157		goto failed_mount7;
4158
4159#ifdef CONFIG_QUOTA
4160	/* Enable quota usage during mount. */
4161	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
4162	    !(sb->s_flags & MS_RDONLY)) {
4163		err = ext4_enable_quotas(sb);
4164		if (err)
4165			goto failed_mount8;
4166	}
4167#endif  /* CONFIG_QUOTA */
4168
4169	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
4170	ext4_orphan_cleanup(sb, es);
4171	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
4172	if (needs_recovery) {
4173		ext4_msg(sb, KERN_INFO, "recovery complete");
4174		ext4_mark_recovery_complete(sb, es);
4175	}
4176	if (EXT4_SB(sb)->s_journal) {
4177		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
4178			descr = " journalled data mode";
4179		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
4180			descr = " ordered data mode";
4181		else
4182			descr = " writeback data mode";
4183	} else
4184		descr = "out journal";
4185
4186	if (test_opt(sb, DISCARD)) {
4187		struct request_queue *q = bdev_get_queue(sb->s_bdev);
4188		if (!blk_queue_discard(q))
4189			ext4_msg(sb, KERN_WARNING,
4190				 "mounting with \"discard\" option, but "
4191				 "the device does not support discard");
4192	}
4193
4194	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
4195		 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
4196		 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
4197
4198	if (es->s_error_count)
4199		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
4200
4201	/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
4202	ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
4203	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
4204	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
4205
4206	kfree(orig_data);
4207	return 0;
4208
4209cantfind_ext4:
4210	if (!silent)
4211		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
4212	goto failed_mount;
4213
4214#ifdef CONFIG_QUOTA
4215failed_mount8:
4216	kobject_del(&sbi->s_kobj);
4217#endif
4218failed_mount7:
4219	ext4_unregister_li_request(sb);
4220failed_mount6:
4221	ext4_mb_release(sb);
4222	if (sbi->s_flex_groups)
4223		ext4_kvfree(sbi->s_flex_groups);
4224	percpu_counter_destroy(&sbi->s_freeclusters_counter);
4225	percpu_counter_destroy(&sbi->s_freeinodes_counter);
4226	percpu_counter_destroy(&sbi->s_dirs_counter);
4227	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
4228failed_mount5:
4229	ext4_ext_release(sb);
4230	ext4_release_system_zone(sb);
4231failed_mount4a:
4232	dput(sb->s_root);
4233	sb->s_root = NULL;
4234failed_mount4:
4235	ext4_msg(sb, KERN_ERR, "mount failed");
4236	if (EXT4_SB(sb)->rsv_conversion_wq)
4237		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4238failed_mount_wq:
4239	if (sbi->s_journal) {
4240		jbd2_journal_destroy(sbi->s_journal);
4241		sbi->s_journal = NULL;
4242	}
4243	ext4_es_unregister_shrinker(sbi);
4244failed_mount3:
4245	del_timer_sync(&sbi->s_err_report);
4246	if (sbi->s_mmp_tsk)
4247		kthread_stop(sbi->s_mmp_tsk);
4248failed_mount2:
4249	for (i = 0; i < db_count; i++)
4250		brelse(sbi->s_group_desc[i]);
4251	ext4_kvfree(sbi->s_group_desc);
4252failed_mount:
4253	if (sbi->s_chksum_driver)
4254		crypto_free_shash(sbi->s_chksum_driver);
4255	if (sbi->s_proc) {
4256		remove_proc_entry("options", sbi->s_proc);
4257		remove_proc_entry(sb->s_id, ext4_proc_root);
4258	}
4259#ifdef CONFIG_QUOTA
4260	for (i = 0; i < EXT4_MAXQUOTAS; i++)
4261		kfree(sbi->s_qf_names[i]);
4262#endif
4263	ext4_blkdev_remove(sbi);
4264	brelse(bh);
4265out_fail:
4266	sb->s_fs_info = NULL;
4267	kfree(sbi->s_blockgroup_lock);
4268	kfree(sbi);
4269out_free_orig:
4270	kfree(orig_data);
4271	return err ? err : ret;
4272}
4273
4274/*
4275 * Setup any per-fs journal parameters now.  We'll do this both on
4276 * initial mount, once the journal has been initialised but before we've
4277 * done any recovery; and again on any subsequent remount.
4278 */
4279static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
4280{
4281	struct ext4_sb_info *sbi = EXT4_SB(sb);
4282
4283	journal->j_commit_interval = sbi->s_commit_interval;
4284	journal->j_min_batch_time = sbi->s_min_batch_time;
4285	journal->j_max_batch_time = sbi->s_max_batch_time;
4286
4287	write_lock(&journal->j_state_lock);
4288	if (test_opt(sb, BARRIER))
4289		journal->j_flags |= JBD2_BARRIER;
4290	else
4291		journal->j_flags &= ~JBD2_BARRIER;
4292	if (test_opt(sb, DATA_ERR_ABORT))
4293		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
4294	else
4295		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
4296	write_unlock(&journal->j_state_lock);
4297}
4298
4299static journal_t *ext4_get_journal(struct super_block *sb,
4300				   unsigned int journal_inum)
4301{
4302	struct inode *journal_inode;
4303	journal_t *journal;
4304
4305	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
4306
4307	/* First, test for the existence of a valid inode on disk.  Bad
4308	 * things happen if we iget() an unused inode, as the subsequent
4309	 * iput() will try to delete it. */
4310
4311	journal_inode = ext4_iget(sb, journal_inum);
4312	if (IS_ERR(journal_inode)) {
4313		ext4_msg(sb, KERN_ERR, "no journal found");
4314		return NULL;
4315	}
4316	if (!journal_inode->i_nlink) {
4317		make_bad_inode(journal_inode);
4318		iput(journal_inode);
4319		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
4320		return NULL;
4321	}
4322
4323	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
4324		  journal_inode, journal_inode->i_size);
4325	if (!S_ISREG(journal_inode->i_mode)) {
4326		ext4_msg(sb, KERN_ERR, "invalid journal inode");
4327		iput(journal_inode);
4328		return NULL;
4329	}
4330
4331	journal = jbd2_journal_init_inode(journal_inode);
4332	if (!journal) {
4333		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
4334		iput(journal_inode);
4335		return NULL;
4336	}
4337	journal->j_private = sb;
4338	ext4_init_journal_params(sb, journal);
4339	return journal;
4340}
4341
4342static journal_t *ext4_get_dev_journal(struct super_block *sb,
4343				       dev_t j_dev)
4344{
4345	struct buffer_head *bh;
4346	journal_t *journal;
4347	ext4_fsblk_t start;
4348	ext4_fsblk_t len;
4349	int hblock, blocksize;
4350	ext4_fsblk_t sb_block;
4351	unsigned long offset;
4352	struct ext4_super_block *es;
4353	struct block_device *bdev;
4354
4355	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
4356
4357	bdev = ext4_blkdev_get(j_dev, sb);
4358	if (bdev == NULL)
4359		return NULL;
4360
4361	blocksize = sb->s_blocksize;
4362	hblock = bdev_logical_block_size(bdev);
4363	if (blocksize < hblock) {
4364		ext4_msg(sb, KERN_ERR,
4365			"blocksize too small for journal device");
4366		goto out_bdev;
4367	}
4368
4369	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
4370	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
4371	set_blocksize(bdev, blocksize);
4372	if (!(bh = __bread(bdev, sb_block, blocksize))) {
4373		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
4374		       "external journal");
4375		goto out_bdev;
4376	}
4377
4378	es = (struct ext4_super_block *) (bh->b_data + offset);
4379	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
4380	    !(le32_to_cpu(es->s_feature_incompat) &
4381	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
4382		ext4_msg(sb, KERN_ERR, "external journal has "
4383					"bad superblock");
4384		brelse(bh);
4385		goto out_bdev;
4386	}
4387
4388	if ((le32_to_cpu(es->s_feature_ro_compat) &
4389	     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
4390	    es->s_checksum != ext4_superblock_csum(sb, es)) {
4391		ext4_msg(sb, KERN_ERR, "external journal has "
4392				       "corrupt superblock");
4393		brelse(bh);
4394		goto out_bdev;
4395	}
4396
4397	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
4398		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
4399		brelse(bh);
4400		goto out_bdev;
4401	}
4402
4403	len = ext4_blocks_count(es);
4404	start = sb_block + 1;
4405	brelse(bh);	/* we're done with the superblock */
4406
4407	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
4408					start, len, blocksize);
4409	if (!journal) {
4410		ext4_msg(sb, KERN_ERR, "failed to create device journal");
4411		goto out_bdev;
4412	}
4413	journal->j_private = sb;
4414	ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
4415	wait_on_buffer(journal->j_sb_buffer);
4416	if (!buffer_uptodate(journal->j_sb_buffer)) {
4417		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
4418		goto out_journal;
4419	}
4420	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
4421		ext4_msg(sb, KERN_ERR, "External journal has more than one "
4422					"user (unsupported) - %d",
4423			be32_to_cpu(journal->j_superblock->s_nr_users));
4424		goto out_journal;
4425	}
4426	EXT4_SB(sb)->journal_bdev = bdev;
4427	ext4_init_journal_params(sb, journal);
4428	return journal;
4429
4430out_journal:
4431	jbd2_journal_destroy(journal);
4432out_bdev:
4433	ext4_blkdev_put(bdev);
4434	return NULL;
4435}
4436
4437static int ext4_load_journal(struct super_block *sb,
4438			     struct ext4_super_block *es,
4439			     unsigned long journal_devnum)
4440{
4441	journal_t *journal;
4442	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
4443	dev_t journal_dev;
4444	int err = 0;
4445	int really_read_only;
4446
4447	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
4448
4449	if (journal_devnum &&
4450	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
4451		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
4452			"numbers have changed");
4453		journal_dev = new_decode_dev(journal_devnum);
4454	} else
4455		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
4456
4457	really_read_only = bdev_read_only(sb->s_bdev);
4458
4459	/*
4460	 * Are we loading a blank journal or performing recovery after a
4461	 * crash?  For recovery, we need to check in advance whether we
4462	 * can get read-write access to the device.
4463	 */
4464	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
4465		if (sb->s_flags & MS_RDONLY) {
4466			ext4_msg(sb, KERN_INFO, "INFO: recovery "
4467					"required on readonly filesystem");
4468			if (really_read_only) {
4469				ext4_msg(sb, KERN_ERR, "write access "
4470					"unavailable, cannot proceed");
4471				return -EROFS;
4472			}
4473			ext4_msg(sb, KERN_INFO, "write access will "
4474			       "be enabled during recovery");
4475		}
4476	}
4477
4478	if (journal_inum && journal_dev) {
4479		ext4_msg(sb, KERN_ERR, "filesystem has both journal "
4480		       "and inode journals!");
4481		return -EINVAL;
4482	}
4483
4484	if (journal_inum) {
4485		if (!(journal = ext4_get_journal(sb, journal_inum)))
4486			return -EINVAL;
4487	} else {
4488		if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
4489			return -EINVAL;
4490	}
4491
4492	if (!(journal->j_flags & JBD2_BARRIER))
4493		ext4_msg(sb, KERN_INFO, "barriers disabled");
4494
4495	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
4496		err = jbd2_journal_wipe(journal, !really_read_only);
4497	if (!err) {
4498		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
4499		if (save)
4500			memcpy(save, ((char *) es) +
4501			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
4502		err = jbd2_journal_load(journal);
4503		if (save)
4504			memcpy(((char *) es) + EXT4_S_ERR_START,
4505			       save, EXT4_S_ERR_LEN);
4506		kfree(save);
4507	}
4508
4509	if (err) {
4510		ext4_msg(sb, KERN_ERR, "error loading journal");
4511		jbd2_journal_destroy(journal);
4512		return err;
4513	}
4514
4515	EXT4_SB(sb)->s_journal = journal;
4516	ext4_clear_journal_err(sb, es);
4517
4518	if (!really_read_only && journal_devnum &&
4519	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
4520		es->s_journal_dev = cpu_to_le32(journal_devnum);
4521
4522		/* Make sure we flush the recovery flag to disk. */
4523		ext4_commit_super(sb, 1);
4524	}
4525
4526	return 0;
4527}
4528
4529static int ext4_commit_super(struct super_block *sb, int sync)
4530{
4531	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
4532	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
4533	int error = 0;
4534
4535	if (!sbh || block_device_ejected(sb))
4536		return error;
4537	if (buffer_write_io_error(sbh)) {
4538		/*
4539		 * Oh, dear.  A previous attempt to write the
4540		 * superblock failed.  This could happen because the
4541		 * USB device was yanked out.  Or it could happen to
4542		 * be a transient write error and maybe the block will
4543		 * be remapped.  Nothing we can do but to retry the
4544		 * write and hope for the best.
4545		 */
4546		ext4_msg(sb, KERN_ERR, "previous I/O error to "
4547		       "superblock detected");
4548		clear_buffer_write_io_error(sbh);
4549		set_buffer_uptodate(sbh);
4550	}
4551	/*
4552	 * If the file system is mounted read-only, don't update the
4553	 * superblock write time.  This avoids updating the superblock
4554	 * write time when we are mounting the root file system
4555	 * read/only but we need to replay the journal; at that point,
4556	 * for people who are east of GMT and who make their clock
4557	 * tick in localtime for Windows bug-for-bug compatibility,
4558	 * the clock is set in the future, and this will cause e2fsck
4559	 * to complain and force a full file system check.
4560	 */
4561	if (!(sb->s_flags & MS_RDONLY))
4562		es->s_wtime = cpu_to_le32(get_seconds());
4563	if (sb->s_bdev->bd_part)
4564		es->s_kbytes_written =
4565			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
4566			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
4567			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
4568	else
4569		es->s_kbytes_written =
4570			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
4571	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
4572		ext4_free_blocks_count_set(es,
4573			EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
4574				&EXT4_SB(sb)->s_freeclusters_counter)));
4575	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
4576		es->s_free_inodes_count =
4577			cpu_to_le32(percpu_counter_sum_positive(
4578				&EXT4_SB(sb)->s_freeinodes_counter));
4579	BUFFER_TRACE(sbh, "marking dirty");
4580	ext4_superblock_csum_set(sb);
4581	mark_buffer_dirty(sbh);
4582	if (sync) {
4583		error = sync_dirty_buffer(sbh);
4584		if (error)
4585			return error;
4586
4587		error = buffer_write_io_error(sbh);
4588		if (error) {
4589			ext4_msg(sb, KERN_ERR, "I/O error while writing "
4590			       "superblock");
4591			clear_buffer_write_io_error(sbh);
4592			set_buffer_uptodate(sbh);
4593		}
4594	}
4595	return error;
4596}
4597
4598/*
4599 * Have we just finished recovery?  If so, and if we are mounting (or
4600 * remounting) the filesystem readonly, then we will end up with a
4601 * consistent fs on disk.  Record that fact.
4602 */
4603static void ext4_mark_recovery_complete(struct super_block *sb,
4604					struct ext4_super_block *es)
4605{
4606	journal_t *journal = EXT4_SB(sb)->s_journal;
4607
4608	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
4609		BUG_ON(journal != NULL);
4610		return;
4611	}
4612	jbd2_journal_lock_updates(journal);
4613	if (jbd2_journal_flush(journal) < 0)
4614		goto out;
4615
4616	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
4617	    sb->s_flags & MS_RDONLY) {
4618		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4619		ext4_commit_super(sb, 1);
4620	}
4621
4622out:
4623	jbd2_journal_unlock_updates(journal);
4624}
4625
4626/*
4627 * If we are mounting (or read-write remounting) a filesystem whose journal
4628 * has recorded an error from a previous lifetime, move that error to the
4629 * main filesystem now.
4630 */
4631static void ext4_clear_journal_err(struct super_block *sb,
4632				   struct ext4_super_block *es)
4633{
4634	journal_t *journal;
4635	int j_errno;
4636	const char *errstr;
4637
4638	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
4639
4640	journal = EXT4_SB(sb)->s_journal;
4641
4642	/*
4643	 * Now check for any error status which may have been recorded in the
4644	 * journal by a prior ext4_error() or ext4_abort()
4645	 */
4646
4647	j_errno = jbd2_journal_errno(journal);
4648	if (j_errno) {
4649		char nbuf[16];
4650
4651		errstr = ext4_decode_error(sb, j_errno, nbuf);
4652		ext4_warning(sb, "Filesystem error recorded "
4653			     "from previous mount: %s", errstr);
4654		ext4_warning(sb, "Marking fs in need of filesystem check.");
4655
4656		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
4657		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
4658		ext4_commit_super(sb, 1);
4659
4660		jbd2_journal_clear_err(journal);
4661		jbd2_journal_update_sb_errno(journal);
4662	}
4663}
4664
4665/*
4666 * Force the running and committing transactions to commit,
4667 * and wait on the commit.
4668 */
4669int ext4_force_commit(struct super_block *sb)
4670{
4671	journal_t *journal;
4672
4673	if (sb->s_flags & MS_RDONLY)
4674		return 0;
4675
4676	journal = EXT4_SB(sb)->s_journal;
4677	return ext4_journal_force_commit(journal);
4678}
4679
4680static int ext4_sync_fs(struct super_block *sb, int wait)
4681{
4682	int ret = 0;
4683	tid_t target;
4684	bool needs_barrier = false;
4685	struct ext4_sb_info *sbi = EXT4_SB(sb);
4686
4687	trace_ext4_sync_fs(sb, wait);
4688	flush_workqueue(sbi->rsv_conversion_wq);
4689	/*
4690	 * Writeback quota in non-journalled quota case - journalled quota has
4691	 * no dirty dquots
4692	 */
4693	dquot_writeback_dquots(sb, -1);
4694	/*
4695	 * Data writeback is possible w/o journal transaction, so barrier must
4696	 * being sent at the end of the function. But we can skip it if
4697	 * transaction_commit will do it for us.
4698	 */
4699	if (sbi->s_journal) {
4700		target = jbd2_get_latest_transaction(sbi->s_journal);
4701		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
4702		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
4703			needs_barrier = true;
4704
4705		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4706			if (wait)
4707				ret = jbd2_log_wait_commit(sbi->s_journal,
4708							   target);
4709		}
4710	} else if (wait && test_opt(sb, BARRIER))
4711		needs_barrier = true;
4712	if (needs_barrier) {
4713		int err;
4714		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4715		if (!ret)
4716			ret = err;
4717	}
4718
4719	return ret;
4720}
4721
4722/*
4723 * LVM calls this function before a (read-only) snapshot is created.  This
4724 * gives us a chance to flush the journal completely and mark the fs clean.
4725 *
4726 * Note that only this function cannot bring a filesystem to be in a clean
4727 * state independently. It relies on upper layer to stop all data & metadata
4728 * modifications.
4729 */
4730static int ext4_freeze(struct super_block *sb)
4731{
4732	int error = 0;
4733	journal_t *journal;
4734
4735	if (sb->s_flags & MS_RDONLY)
4736		return 0;
4737
4738	journal = EXT4_SB(sb)->s_journal;
4739
4740	if (journal) {
4741		/* Now we set up the journal barrier. */
4742		jbd2_journal_lock_updates(journal);
4743
4744		/*
4745		 * Don't clear the needs_recovery flag if we failed to
4746		 * flush the journal.
4747		 */
4748		error = jbd2_journal_flush(journal);
4749		if (error < 0)
4750			goto out;
4751	}
4752
4753	/* Journal blocked and flushed, clear needs_recovery flag. */
4754	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4755	error = ext4_commit_super(sb, 1);
4756out:
4757	if (journal)
4758		/* we rely on upper layer to stop further updates */
4759		jbd2_journal_unlock_updates(journal);
4760	return error;
4761}
4762
4763/*
4764 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
4765 * flag here, even though the filesystem is not technically dirty yet.
4766 */
4767static int ext4_unfreeze(struct super_block *sb)
4768{
4769	if (sb->s_flags & MS_RDONLY)
4770		return 0;
4771
4772	/* Reset the needs_recovery flag before the fs is unlocked. */
4773	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4774	ext4_commit_super(sb, 1);
4775	return 0;
4776}
4777
4778/*
4779 * Structure to save mount options for ext4_remount's benefit
4780 */
4781struct ext4_mount_options {
4782	unsigned long s_mount_opt;
4783	unsigned long s_mount_opt2;
4784	kuid_t s_resuid;
4785	kgid_t s_resgid;
4786	unsigned long s_commit_interval;
4787	u32 s_min_batch_time, s_max_batch_time;
4788#ifdef CONFIG_QUOTA
4789	int s_jquota_fmt;
4790	char *s_qf_names[EXT4_MAXQUOTAS];
4791#endif
4792};
4793
4794static int ext4_remount(struct super_block *sb, int *flags, char *data)
4795{
4796	struct ext4_super_block *es;
4797	struct ext4_sb_info *sbi = EXT4_SB(sb);
4798	unsigned long old_sb_flags;
4799	struct ext4_mount_options old_opts;
4800	int enable_quota = 0;
4801	ext4_group_t g;
4802	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4803	int err = 0;
4804#ifdef CONFIG_QUOTA
4805	int i, j;
4806#endif
4807	char *orig_data = kstrdup(data, GFP_KERNEL);
4808
4809	/* Store the original options */
4810	old_sb_flags = sb->s_flags;
4811	old_opts.s_mount_opt = sbi->s_mount_opt;
4812	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
4813	old_opts.s_resuid = sbi->s_resuid;
4814	old_opts.s_resgid = sbi->s_resgid;
4815	old_opts.s_commit_interval = sbi->s_commit_interval;
4816	old_opts.s_min_batch_time = sbi->s_min_batch_time;
4817	old_opts.s_max_batch_time = sbi->s_max_batch_time;
4818#ifdef CONFIG_QUOTA
4819	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
4820	for (i = 0; i < EXT4_MAXQUOTAS; i++)
4821		if (sbi->s_qf_names[i]) {
4822			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
4823							 GFP_KERNEL);
4824			if (!old_opts.s_qf_names[i]) {
4825				for (j = 0; j < i; j++)
4826					kfree(old_opts.s_qf_names[j]);
4827				kfree(orig_data);
4828				return -ENOMEM;
4829			}
4830		} else
4831			old_opts.s_qf_names[i] = NULL;
4832#endif
4833	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
4834		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
4835
4836	/*
4837	 * Allow the "check" option to be passed as a remount option.
4838	 */
4839	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
4840		err = -EINVAL;
4841		goto restore_opts;
4842	}
4843
4844	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4845		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4846			ext4_msg(sb, KERN_ERR, "can't mount with "
4847				 "both data=journal and delalloc");
4848			err = -EINVAL;
4849			goto restore_opts;
4850		}
4851		if (test_opt(sb, DIOREAD_NOLOCK)) {
4852			ext4_msg(sb, KERN_ERR, "can't mount with "
4853				 "both data=journal and dioread_nolock");
4854			err = -EINVAL;
4855			goto restore_opts;
4856		}
4857	}
4858
4859	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
4860		ext4_abort(sb, "Abort forced by user");
4861
4862	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
4863		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
4864
4865	es = sbi->s_es;
4866
4867	if (sbi->s_journal) {
4868		ext4_init_journal_params(sb, sbi->s_journal);
4869		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4870	}
4871
4872	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
4873		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
4874			err = -EROFS;
4875			goto restore_opts;
4876		}
4877
4878		if (*flags & MS_RDONLY) {
4879			err = sync_filesystem(sb);
4880			if (err < 0)
4881				goto restore_opts;
4882			err = dquot_suspend(sb, -1);
4883			if (err < 0)
4884				goto restore_opts;
4885
4886			/*
4887			 * First of all, the unconditional stuff we have to do
4888			 * to disable replay of the journal when we next remount
4889			 */
4890			sb->s_flags |= MS_RDONLY;
4891
4892			/*
4893			 * OK, test if we are remounting a valid rw partition
4894			 * readonly, and if so set the rdonly flag and then
4895			 * mark the partition as valid again.
4896			 */
4897			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
4898			    (sbi->s_mount_state & EXT4_VALID_FS))
4899				es->s_state = cpu_to_le16(sbi->s_mount_state);
4900
4901			if (sbi->s_journal)
4902				ext4_mark_recovery_complete(sb, es);
4903		} else {
4904			/* Make sure we can mount this feature set readwrite */
4905			if (!ext4_feature_set_ok(sb, 0)) {
4906				err = -EROFS;
4907				goto restore_opts;
4908			}
4909			/*
4910			 * Make sure the group descriptor checksums
4911			 * are sane.  If they aren't, refuse to remount r/w.
4912			 */
4913			for (g = 0; g < sbi->s_groups_count; g++) {
4914				struct ext4_group_desc *gdp =
4915					ext4_get_group_desc(sb, g, NULL);
4916
4917				if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
4918					ext4_msg(sb, KERN_ERR,
4919	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
4920		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
4921					       le16_to_cpu(gdp->bg_checksum));
4922					err = -EINVAL;
4923					goto restore_opts;
4924				}
4925			}
4926
4927			/*
4928			 * If we have an unprocessed orphan list hanging
4929			 * around from a previously readonly bdev mount,
4930			 * require a full umount/remount for now.
4931			 */
4932			if (es->s_last_orphan) {
4933				ext4_msg(sb, KERN_WARNING, "Couldn't "
4934				       "remount RDWR because of unprocessed "
4935				       "orphan inode list.  Please "
4936				       "umount/remount instead");
4937				err = -EINVAL;
4938				goto restore_opts;
4939			}
4940
4941			/*
4942			 * Mounting a RDONLY partition read-write, so reread
4943			 * and store the current valid flag.  (It may have
4944			 * been changed by e2fsck since we originally mounted
4945			 * the partition.)
4946			 */
4947			if (sbi->s_journal)
4948				ext4_clear_journal_err(sb, es);
4949			sbi->s_mount_state = le16_to_cpu(es->s_state);
4950			if (!ext4_setup_super(sb, es, 0))
4951				sb->s_flags &= ~MS_RDONLY;
4952			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
4953						     EXT4_FEATURE_INCOMPAT_MMP))
4954				if (ext4_multi_mount_protect(sb,
4955						le64_to_cpu(es->s_mmp_block))) {
4956					err = -EROFS;
4957					goto restore_opts;
4958				}
4959			enable_quota = 1;
4960		}
4961	}
4962
4963	/*
4964	 * Reinitialize lazy itable initialization thread based on
4965	 * current settings
4966	 */
4967	if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
4968		ext4_unregister_li_request(sb);
4969	else {
4970		ext4_group_t first_not_zeroed;
4971		first_not_zeroed = ext4_has_uninit_itable(sb);
4972		ext4_register_li_request(sb, first_not_zeroed);
4973	}
4974
4975	ext4_setup_system_zone(sb);
4976	if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
4977		ext4_commit_super(sb, 1);
4978
4979#ifdef CONFIG_QUOTA
4980	/* Release old quota file names */
4981	for (i = 0; i < EXT4_MAXQUOTAS; i++)
4982		kfree(old_opts.s_qf_names[i]);
4983	if (enable_quota) {
4984		if (sb_any_quota_suspended(sb))
4985			dquot_resume(sb, -1);
4986		else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4987					EXT4_FEATURE_RO_COMPAT_QUOTA)) {
4988			err = ext4_enable_quotas(sb);
4989			if (err)
4990				goto restore_opts;
4991		}
4992	}
4993#endif
4994
4995	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
4996	kfree(orig_data);
4997	return 0;
4998
4999restore_opts:
5000	sb->s_flags = old_sb_flags;
5001	sbi->s_mount_opt = old_opts.s_mount_opt;
5002	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
5003	sbi->s_resuid = old_opts.s_resuid;
5004	sbi->s_resgid = old_opts.s_resgid;
5005	sbi->s_commit_interval = old_opts.s_commit_interval;
5006	sbi->s_min_batch_time = old_opts.s_min_batch_time;
5007	sbi->s_max_batch_time = old_opts.s_max_batch_time;
5008#ifdef CONFIG_QUOTA
5009	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
5010	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
5011		kfree(sbi->s_qf_names[i]);
5012		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
5013	}
5014#endif
5015	kfree(orig_data);
5016	return err;
5017}
5018
5019static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
5020{
5021	struct super_block *sb = dentry->d_sb;
5022	struct ext4_sb_info *sbi = EXT4_SB(sb);
5023	struct ext4_super_block *es = sbi->s_es;
5024	ext4_fsblk_t overhead = 0, resv_blocks;
5025	u64 fsid;
5026	s64 bfree;
5027	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
5028
5029	if (!test_opt(sb, MINIX_DF))
5030		overhead = sbi->s_overhead;
5031
5032	buf->f_type = EXT4_SUPER_MAGIC;
5033	buf->f_bsize = sb->s_blocksize;
5034	buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
5035	bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
5036		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
5037	/* prevent underflow in case that few free space is available */
5038	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
5039	buf->f_bavail = buf->f_bfree -
5040			(ext4_r_blocks_count(es) + resv_blocks);
5041	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
5042		buf->f_bavail = 0;
5043	buf->f_files = le32_to_cpu(es->s_inodes_count);
5044	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
5045	buf->f_namelen = EXT4_NAME_LEN;
5046	fsid = le64_to_cpup((void *)es->s_uuid) ^
5047	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
5048	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
5049	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
5050
5051	return 0;
5052}
5053
5054/* Helper function for writing quotas on sync - we need to start transaction
 * before quota file is locked for write. Otherwise there are possible deadlocks:
5056 * Process 1                         Process 2
5057 * ext4_create()                     quota_sync()
5058 *   jbd2_journal_start()                  write_dquot()
5059 *   dquot_initialize()                         down(dqio_mutex)
5060 *     down(dqio_mutex)                    jbd2_journal_start()
5061 *
5062 */
5063
5064#ifdef CONFIG_QUOTA
5065
5066static inline struct inode *dquot_to_inode(struct dquot *dquot)
5067{
5068	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
5069}
5070
5071static int ext4_write_dquot(struct dquot *dquot)
5072{
5073	int ret, err;
5074	handle_t *handle;
5075	struct inode *inode;
5076
5077	inode = dquot_to_inode(dquot);
5078	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
5079				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
5080	if (IS_ERR(handle))
5081		return PTR_ERR(handle);
5082	ret = dquot_commit(dquot);
5083	err = ext4_journal_stop(handle);
5084	if (!ret)
5085		ret = err;
5086	return ret;
5087}
5088
5089static int ext4_acquire_dquot(struct dquot *dquot)
5090{
5091	int ret, err;
5092	handle_t *handle;
5093
5094	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
5095				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
5096	if (IS_ERR(handle))
5097		return PTR_ERR(handle);
5098	ret = dquot_acquire(dquot);
5099	err = ext4_journal_stop(handle);
5100	if (!ret)
5101		ret = err;
5102	return ret;
5103}
5104
5105static int ext4_release_dquot(struct dquot *dquot)
5106{
5107	int ret, err;
5108	handle_t *handle;
5109
5110	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
5111				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
5112	if (IS_ERR(handle)) {
5113		/* Release dquot anyway to avoid endless cycle in dqput() */
5114		dquot_release(dquot);
5115		return PTR_ERR(handle);
5116	}
5117	ret = dquot_release(dquot);
5118	err = ext4_journal_stop(handle);
5119	if (!ret)
5120		ret = err;
5121	return ret;
5122}
5123
5124static int ext4_mark_dquot_dirty(struct dquot *dquot)
5125{
5126	struct super_block *sb = dquot->dq_sb;
5127	struct ext4_sb_info *sbi = EXT4_SB(sb);
5128
5129	/* Are we journaling quotas? */
5130	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
5131	    sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
5132		dquot_mark_dquot_dirty(dquot);
5133		return ext4_write_dquot(dquot);
5134	} else {
5135		return dquot_mark_dquot_dirty(dquot);
5136	}
5137}
5138
5139static int ext4_write_info(struct super_block *sb, int type)
5140{
5141	int ret, err;
5142	handle_t *handle;
5143
5144	/* Data block + inode block */
5145	handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2);
5146	if (IS_ERR(handle))
5147		return PTR_ERR(handle);
5148	ret = dquot_commit_info(sb, type);
5149	err = ext4_journal_stop(handle);
5150	if (!ret)
5151		ret = err;
5152	return ret;
5153}
5154
5155/*
5156 * Turn on quotas during mount time - we need to find
5157 * the quota file and such...
5158 */
5159static int ext4_quota_on_mount(struct super_block *sb, int type)
5160{
5161	return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
5162					EXT4_SB(sb)->s_jquota_fmt, type);
5163}
5164
5165/*
5166 * Standard function to be called on quota_on
5167 */
5168static int ext4_quota_on(struct super_block *sb, int type, int format_id,
5169			 struct path *path)
5170{
5171	int err;
5172
5173	if (!test_opt(sb, QUOTA))
5174		return -EINVAL;
5175
5176	/* Quotafile not on the same filesystem? */
5177	if (path->dentry->d_sb != sb)
5178		return -EXDEV;
5179	/* Journaling quota? */
5180	if (EXT4_SB(sb)->s_qf_names[type]) {
5181		/* Quotafile not in fs root? */
5182		if (path->dentry->d_parent != sb->s_root)
5183			ext4_msg(sb, KERN_WARNING,
5184				"Quota file not on filesystem root. "
5185				"Journaled quota will not work");
5186	}
5187
5188	/*
5189	 * When we journal data on quota file, we have to flush journal to see
5190	 * all updates to the file when we bypass pagecache...
5191	 */
5192	if (EXT4_SB(sb)->s_journal &&
5193	    ext4_should_journal_data(path->dentry->d_inode)) {
5194		/*
5195		 * We don't need to lock updates but journal_flush() could
5196		 * otherwise be livelocked...
5197		 */
5198		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
5199		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
5200		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
5201		if (err)
5202			return err;
5203	}
5204
5205	return dquot_quota_on(sb, type, format_id, path);
5206}
5207
5208static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
5209			     unsigned int flags)
5210{
5211	int err;
5212	struct inode *qf_inode;
5213	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5214		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5215		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
5216	};
5217
5218	BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
5219
5220	if (!qf_inums[type])
5221		return -EPERM;
5222
5223	qf_inode = ext4_iget(sb, qf_inums[type]);
5224	if (IS_ERR(qf_inode)) {
5225		ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
5226		return PTR_ERR(qf_inode);
5227	}
5228
5229	/* Don't account quota for quota files to avoid recursion */
5230	qf_inode->i_flags |= S_NOQUOTA;
5231	err = dquot_enable(qf_inode, type, format_id, flags);
5232	iput(qf_inode);
5233
5234	return err;
5235}
5236
5237/* Enable usage tracking for all quota types. */
5238static int ext4_enable_quotas(struct super_block *sb)
5239{
5240	int type, err = 0;
5241	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5242		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5243		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
5244	};
5245
5246	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
5247	for (type = 0; type < EXT4_MAXQUOTAS; type++) {
5248		if (qf_inums[type]) {
5249			err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
5250						DQUOT_USAGE_ENABLED);
5251			if (err) {
5252				ext4_warning(sb,
5253					"Failed to enable quota tracking "
5254					"(type=%d, err=%d). Please run "
5255					"e2fsck to fix.", type, err);
5256				return err;
5257			}
5258		}
5259	}
5260	return 0;
5261}
5262
5263/*
5264 * quota_on function that is used when QUOTA feature is set.
5265 */
5266static int ext4_quota_on_sysfile(struct super_block *sb, int type,
5267				 int format_id)
5268{
5269	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5270		return -EINVAL;
5271
5272	/*
5273	 * USAGE was enabled at mount time. Only need to enable LIMITS now.
5274	 */
5275	return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
5276}
5277
5278static int ext4_quota_off(struct super_block *sb, int type)
5279{
5280	struct inode *inode = sb_dqopt(sb)->files[type];
5281	handle_t *handle;
5282
5283	/* Force all delayed allocation blocks to be allocated.
5284	 * Caller already holds s_umount sem */
5285	if (test_opt(sb, DELALLOC))
5286		sync_filesystem(sb);
5287
5288	if (!inode)
5289		goto out;
5290
5291	/* Update modification times of quota files when userspace can
5292	 * start looking at them */
5293	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
5294	if (IS_ERR(handle))
5295		goto out;
5296	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
5297	ext4_mark_inode_dirty(handle, inode);
5298	ext4_journal_stop(handle);
5299
5300out:
5301	return dquot_quota_off(sb, type);
5302}
5303
5304/*
5305 * quota_off function that is used when QUOTA feature is set.
5306 */
5307static int ext4_quota_off_sysfile(struct super_block *sb, int type)
5308{
5309	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5310		return -EINVAL;
5311
5312	/* Disable only the limits. */
5313	return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
5314}
5315
5316/* Read data from quotafile - avoid pagecache and such because we cannot afford
5317 * acquiring the locks... As quota files are never truncated and quota code
5318 * itself serializes the operations (and no one else should touch the files)
5319 * we don't have to be afraid of races */
5320static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
5321			       size_t len, loff_t off)
5322{
5323	struct inode *inode = sb_dqopt(sb)->files[type];
5324	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
5325	int offset = off & (sb->s_blocksize - 1);
5326	int tocopy;
5327	size_t toread;
5328	struct buffer_head *bh;
5329	loff_t i_size = i_size_read(inode);
5330
5331	if (off > i_size)
5332		return 0;
5333	if (off+len > i_size)
5334		len = i_size-off;
5335	toread = len;
5336	while (toread > 0) {
5337		tocopy = sb->s_blocksize - offset < toread ?
5338				sb->s_blocksize - offset : toread;
5339		bh = ext4_bread(NULL, inode, blk, 0);
5340		if (IS_ERR(bh))
5341			return PTR_ERR(bh);
5342		if (!bh)	/* A hole? */
5343			memset(data, 0, tocopy);
5344		else
5345			memcpy(data, bh->b_data+offset, tocopy);
5346		brelse(bh);
5347		offset = 0;
5348		toread -= tocopy;
5349		data += tocopy;
5350		blk++;
5351	}
5352	return len;
5353}
5354
5355/* Write to quotafile (we know the transaction is already started and has
5356 * enough credits) */
5357static ssize_t ext4_quota_write(struct super_block *sb, int type,
5358				const char *data, size_t len, loff_t off)
5359{
5360	struct inode *inode = sb_dqopt(sb)->files[type];
5361	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
5362	int err, offset = off & (sb->s_blocksize - 1);
5363	struct buffer_head *bh;
5364	handle_t *handle = journal_current_handle();
5365
5366	if (EXT4_SB(sb)->s_journal && !handle) {
5367		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
5368			" cancelled because transaction is not started",
5369			(unsigned long long)off, (unsigned long long)len);
5370		return -EIO;
5371	}
5372	/*
5373	 * Since we account only one data block in transaction credits,
5374	 * then it is impossible to cross a block boundary.
5375	 */
5376	if (sb->s_blocksize - offset < len) {
5377		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
5378			" cancelled because not block aligned",
5379			(unsigned long long)off, (unsigned long long)len);
5380		return -EIO;
5381	}
5382
5383	bh = ext4_bread(handle, inode, blk, 1);
5384	if (IS_ERR(bh))
5385		return PTR_ERR(bh);
5386	if (!bh)
5387		goto out;
5388	BUFFER_TRACE(bh, "get write access");
5389	err = ext4_journal_get_write_access(handle, bh);
5390	if (err) {
5391		brelse(bh);
5392		return err;
5393	}
5394	lock_buffer(bh);
5395	memcpy(bh->b_data+offset, data, len);
5396	flush_dcache_page(bh->b_page);
5397	unlock_buffer(bh);
5398	err = ext4_handle_dirty_metadata(handle, NULL, bh);
5399	brelse(bh);
5400out:
5401	if (inode->i_size < off + len) {
5402		i_size_write(inode, off + len);
5403		EXT4_I(inode)->i_disksize = inode->i_size;
5404		ext4_mark_inode_dirty(handle, inode);
5405	}
5406	return len;
5407}
5408
5409#endif
5410
5411static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
5412		       const char *dev_name, void *data)
5413{
5414	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
5415}
5416
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
/* Register ext2_fs_type; a registration failure is only reported. */
static inline void register_as_ext2(void)
{
	int err = register_filesystem(&ext2_fs_type);

	if (!err)
		return;
	printk(KERN_WARNING
	       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
}

/* Undo register_as_ext2(). */
static inline void unregister_as_ext2(void)
{
	unregister_filesystem(&ext2_fs_type);
}

/*
 * Return 1 if the feature set of @sb is within what the ext2 support masks
 * allow (ro_compat features are not checked for a read-only mount),
 * 0 otherwise.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
		return 0;
	if (sb->s_flags & MS_RDONLY)
		return 1;
	return !EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP);
}
#else
/* Stubs used when this configuration does not register ext2_fs_type. */
static inline void register_as_ext2(void)
{
}

static inline void unregister_as_ext2(void)
{
}

static inline int ext2_feature_set_ok(struct super_block *sb)
{
	return 0;
}
#endif
5446
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
/* Register ext3_fs_type; a registration failure is only reported. */
static inline void register_as_ext3(void)
{
	int err = register_filesystem(&ext3_fs_type);

	if (!err)
		return;
	printk(KERN_WARNING
	       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
}

/* Undo register_as_ext3(). */
static inline void unregister_as_ext3(void)
{
	unregister_filesystem(&ext3_fs_type);
}

/*
 * Return 1 if @sb has a journal and its feature set is within what the
 * ext3 support masks allow (ro_compat features are not checked for a
 * read-only mount), 0 otherwise.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP) ||
	    !EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
		return 0;
	if (sb->s_flags & MS_RDONLY)
		return 1;
	return !EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
}
#else
/* Stubs used when this configuration does not register ext3_fs_type. */
static inline void register_as_ext3(void)
{
}

static inline void unregister_as_ext3(void)
{
}

static inline int ext3_feature_set_ok(struct super_block *sb)
{
	return 0;
}
#endif
5478
5479static struct file_system_type ext4_fs_type = {
5480	.owner		= THIS_MODULE,
5481	.name		= "ext4",
5482	.mount		= ext4_mount,
5483	.kill_sb	= kill_block_super,
5484	.fs_flags	= FS_REQUIRES_DEV,
5485};
5486MODULE_ALIAS_FS("ext4");
5487
5488static int __init ext4_init_feat_adverts(void)
5489{
5490	struct ext4_features *ef;
5491	int ret = -ENOMEM;
5492
5493	ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
5494	if (!ef)
5495		goto out;
5496
5497	ef->f_kobj.kset = ext4_kset;
5498	init_completion(&ef->f_kobj_unregister);
5499	ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
5500				   "features");
5501	if (ret) {
5502		kfree(ef);
5503		goto out;
5504	}
5505
5506	ext4_feat = ef;
5507	ret = 0;
5508out:
5509	return ret;
5510}
5511
5512static void ext4_exit_feat_adverts(void)
5513{
5514	kobject_put(&ext4_feat->f_kobj);
5515	wait_for_completion(&ext4_feat->f_kobj_unregister);
5516	kfree(ext4_feat);
5517}
5518
5519/* Shared across all ext4 file systems */
5520wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
5521struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
5522
5523static int __init ext4_init_fs(void)
5524{
5525	int i, err;
5526
5527	ext4_li_info = NULL;
5528	mutex_init(&ext4_li_mtx);
5529
5530	/* Build-time check for flags consistency */
5531	ext4_check_flag_values();
5532
5533	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
5534		mutex_init(&ext4__aio_mutex[i]);
5535		init_waitqueue_head(&ext4__ioend_wq[i]);
5536	}
5537
5538	err = ext4_init_es();
5539	if (err)
5540		return err;
5541
5542	err = ext4_init_pageio();
5543	if (err)
5544		goto out7;
5545
5546	err = ext4_init_system_zone();
5547	if (err)
5548		goto out6;
5549	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
5550	if (!ext4_kset) {
5551		err = -ENOMEM;
5552		goto out5;
5553	}
5554	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
5555
5556	err = ext4_init_feat_adverts();
5557	if (err)
5558		goto out4;
5559
5560	err = ext4_init_mballoc();
5561	if (err)
5562		goto out2;
5563	else
5564		ext4_mballoc_ready = 1;
5565	err = init_inodecache();
5566	if (err)
5567		goto out1;
5568	register_as_ext3();
5569	register_as_ext2();
5570	err = register_filesystem(&ext4_fs_type);
5571	if (err)
5572		goto out;
5573
5574	return 0;
5575out:
5576	unregister_as_ext2();
5577	unregister_as_ext3();
5578	destroy_inodecache();
5579out1:
5580	ext4_mballoc_ready = 0;
5581	ext4_exit_mballoc();
5582out2:
5583	ext4_exit_feat_adverts();
5584out4:
5585	if (ext4_proc_root)
5586		remove_proc_entry("fs/ext4", NULL);
5587	kset_unregister(ext4_kset);
5588out5:
5589	ext4_exit_system_zone();
5590out6:
5591	ext4_exit_pageio();
5592out7:
5593	ext4_exit_es();
5594
5595	return err;
5596}
5597
5598static void __exit ext4_exit_fs(void)
5599{
5600	ext4_destroy_lazyinit_thread();
5601	unregister_as_ext2();
5602	unregister_as_ext3();
5603	unregister_filesystem(&ext4_fs_type);
5604	destroy_inodecache();
5605	ext4_exit_mballoc();
5606	ext4_exit_feat_adverts();
5607	remove_proc_entry("fs/ext4", NULL);
5608	kset_unregister(ext4_kset);
5609	ext4_exit_system_zone();
5610	ext4_exit_pageio();
5611	ext4_exit_es();
5612}
5613
5614MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
5615MODULE_DESCRIPTION("Fourth Extended Filesystem");
5616MODULE_LICENSE("GPL");
5617module_init(ext4_init_fs)
5618module_exit(ext4_exit_fs)