fs/ext4/ialloc.c at v3.18-rc2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / ext4 / ialloc.c
at v3.18-rc2 1324 lines 38 kB view raw
   1/*
   2 *  linux/fs/ext4/ialloc.c
   3 *
   4 * Copyright (C) 1992, 1993, 1994, 1995
   5 * Remy Card (card@masi.ibp.fr)
   6 * Laboratoire MASI - Institut Blaise Pascal
   7 * Universite Pierre et Marie Curie (Paris VI)
   8 *
   9 *  BSD ufs-inspired inode and directory allocation by
  10 *  Stephen Tweedie (sct@redhat.com), 1993
  11 *  Big-endian to little-endian byte-swapping/bitmaps by
  12 *        David S. Miller (davem@caip.rutgers.edu), 1995
  13 */
  14
  15#include <linux/time.h>
  16#include <linux/fs.h>
  17#include <linux/jbd2.h>
  18#include <linux/stat.h>
  19#include <linux/string.h>
  20#include <linux/quotaops.h>
  21#include <linux/buffer_head.h>
  22#include <linux/random.h>
  23#include <linux/bitops.h>
  24#include <linux/blkdev.h>
  25#include <asm/byteorder.h>
  26
  27#include "ext4.h"
  28#include "ext4_jbd2.h"
  29#include "xattr.h"
  30#include "acl.h"
  31
  32#include <trace/events/ext4.h>
  33
  34/*
  35 * ialloc.c contains the inodes allocation and deallocation routines
  36 */
  37
/*
 * The free inodes are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the group.
 */
  47
  48/*
  49 * To avoid calling the atomic setbit hundreds or thousands of times, we only
  50 * need to use it within a single byte (to ensure we get endianness right).
  51 * We can use memset for the rest of the bitmap as there are no other users.
  52 */
  53void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
  54{
  55	int i;
  56
  57	if (start_bit >= end_bit)
  58		return;
  59
  60	ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
  61	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
  62		ext4_set_bit(i, bitmap);
  63	if (i < end_bit)
  64		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
  65}
  66
  67/* Initializes an uninitialized inode bitmap */
  68static unsigned ext4_init_inode_bitmap(struct super_block *sb,
  69				       struct buffer_head *bh,
  70				       ext4_group_t block_group,
  71				       struct ext4_group_desc *gdp)
  72{
  73	struct ext4_group_info *grp;
  74	struct ext4_sb_info *sbi = EXT4_SB(sb);
  75	J_ASSERT_BH(bh, buffer_locked(bh));
  76
  77	/* If checksum is bad mark all blocks and inodes use to prevent
  78	 * allocation, essentially implementing a per-group read-only flag. */
  79	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
  80		ext4_error(sb, "Checksum bad for group %u", block_group);
  81		grp = ext4_get_group_info(sb, block_group);
  82		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
  83			percpu_counter_sub(&sbi->s_freeclusters_counter,
  84					   grp->bb_free);
  85		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
  86		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
  87			int count;
  88			count = ext4_free_inodes_count(sb, gdp);
  89			percpu_counter_sub(&sbi->s_freeinodes_counter,
  90					   count);
  91		}
  92		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
  93		return 0;
  94	}
  95
  96	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
  97	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
  98			bh->b_data);
  99	ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
 100				   EXT4_INODES_PER_GROUP(sb) / 8);
 101	ext4_group_desc_csum_set(sb, block_group, gdp);
 102
 103	return EXT4_INODES_PER_GROUP(sb);
 104}
 105
 106void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
 107{
 108	if (uptodate) {
 109		set_buffer_uptodate(bh);
 110		set_bitmap_uptodate(bh);
 111	}
 112	unlock_buffer(bh);
 113	put_bh(bh);
 114}
 115
/*
 * Read the inode allocation bitmap for a given block_group.
 *
 * The bitmap buffer is obtained with sb_getblk().  If the group descriptor
 * has EXT4_BG_INODE_UNINIT set, the bitmap is built in memory by
 * ext4_init_inode_bitmap() and returned without any disk read.  Otherwise,
 * unless the buffer is already up to date, a read is submitted and waited
 * for.  A bitmap that was not freshly initialized has its checksum checked
 * unless the buffer is already flagged verified.
 *
 * Return buffer_head of bitmap on success or NULL.  NULL is returned when
 * the group descriptor cannot be found, the buffer cannot be obtained, the
 * read fails, or the checksum does not verify; in the last case the group
 * is also flagged as having a corrupt inode bitmap and its free inodes are
 * subtracted from the filesystem-wide counter (once).
 */
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
	struct ext4_group_desc *desc;
	struct buffer_head *bh = NULL;
	ext4_fsblk_t bitmap_blk;
	struct ext4_group_info *grp;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	desc = ext4_get_group_desc(sb, block_group, NULL);
	if (!desc)
		return NULL;

	bitmap_blk = ext4_inode_bitmap(sb, desc);
	bh = sb_getblk(sb, bitmap_blk);
	if (unlikely(!bh)) {
		ext4_error(sb, "Cannot read inode bitmap - "
			    "block_group = %u, inode_bitmap = %llu",
			    block_group, bitmap_blk);
		return NULL;
	}
	/* Fast path: bitmap already marked up to date, no buffer lock needed */
	if (bitmap_uptodate(bh))
		goto verify;

	/* Re-check under the buffer lock in case it became up to date */
	lock_buffer(bh);
	if (bitmap_uptodate(bh)) {
		unlock_buffer(bh);
		goto verify;
	}

	/* bg_flags is examined (and the bitmap built) under the group lock */
	ext4_lock_group(sb, block_group);
	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
		/*
		 * Bitmap is generated in memory rather than read; it is
		 * flagged verified here, so the checksum check below is
		 * bypassed for it.
		 */
		ext4_init_inode_bitmap(sb, bh, block_group, desc);
		set_bitmap_uptodate(bh);
		set_buffer_uptodate(bh);
		set_buffer_verified(bh);
		ext4_unlock_group(sb, block_group);
		unlock_buffer(bh);
		return bh;
	}
	ext4_unlock_group(sb, block_group);

	if (buffer_uptodate(bh)) {
		/*
		 * if not uninit if bh is uptodate,
		 * bitmap is also uptodate
		 */
		set_bitmap_uptodate(bh);
		unlock_buffer(bh);
		goto verify;
	}
	/*
	 * submit the buffer_head for reading
	 */
	trace_ext4_load_inode_bitmap(sb, block_group);
	bh->b_end_io = ext4_end_bitmap_read;
	/* ext4_end_bitmap_read() unlocks bh and does the matching put_bh() */
	get_bh(bh);
	submit_bh(READ | REQ_META | REQ_PRIO, bh);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh)) {
		/* Read failed: release our buffer and report the error */
		put_bh(bh);
		ext4_error(sb, "Cannot read inode bitmap - "
			   "block_group = %u, inode_bitmap = %llu",
			   block_group, bitmap_blk);
		return NULL;
	}

verify:
	ext4_lock_group(sb, block_group);
	if (!buffer_verified(bh) &&
	    !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
					   EXT4_INODES_PER_GROUP(sb) / 8)) {
		/* The group lock is dropped before ext4_error() is called */
		ext4_unlock_group(sb, block_group);
		put_bh(bh);
		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
			   "inode_bitmap = %llu", block_group, bitmap_blk);
		grp = ext4_get_group_info(sb, block_group);
		/* Only adjust the global counter the first time we flag it */
		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
			int count;
			count = ext4_free_inodes_count(sb, desc);
			percpu_counter_sub(&sbi->s_freeinodes_counter,
					   count);
		}
		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
		return NULL;
	}
	ext4_unlock_group(sb, block_group);
	set_buffer_verified(bh);
	return bh;
}
 212
/*
 * Release an on-disk inode: drop its quota and xattrs, clear its bit in the
 * group's inode bitmap and update the free-inode / used-directory counts in
 * the group descriptor, the per-filesystem counters and (when flex groups
 * are in use) the flex group counters.
 *
 * @handle: journal handle used for all metadata modifications
 * @inode:  inode being freed; it must have no links and no extra references,
 *          otherwise a message is logged and nothing is done
 *
 * Nothing is returned; a failure recorded in 'fatal' is reported through
 * ext4_std_error().
 *
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
 * race conditions we have to worry about. The inode
 * is not on the hash-lists, and it cannot be reached
 * through the filesystem because the directory entry
 * has been deleted earlier.
 *
 * HOWEVER: we must make sure that we get no aliases,
 * which means that we have to call "clear_inode()"
 * _before_ we mark the inode not in use in the inode
 * bitmaps. Otherwise a newly created file might use
 * the same inode number (not actually the same pointer
 * though), and then we'd have two inodes sharing the
 * same inode number and space on the harddisk.
 */
void ext4_free_inode(handle_t *handle, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	int is_directory;
	unsigned long ino;
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *bh2;
	ext4_group_t block_group;
	unsigned long bit;
	struct ext4_group_desc *gdp;
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi;
	int fatal = 0, err, count, cleared;
	struct ext4_group_info *grp;

	/* Sanity checks: refuse to free an inode that still looks in use */
	if (!sb) {
		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
		       "nonexistent device\n", __func__, __LINE__);
		return;
	}
	if (atomic_read(&inode->i_count) > 1) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
			 __func__, __LINE__, inode->i_ino,
			 atomic_read(&inode->i_count));
		return;
	}
	if (inode->i_nlink) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
		return;
	}
	sbi = EXT4_SB(sb);

	ino = inode->i_ino;
	ext4_debug("freeing inode %lu\n", ino);
	trace_ext4_free_inode(inode);

	/*
	 * Note: we must free any quota before locking the superblock,
	 * as writing the quota to disk may need the lock as well.
	 */
	dquot_initialize(inode);
	ext4_xattr_delete_inode(handle, inode);
	dquot_free_inode(inode);
	dquot_drop(inode);

	/* Remember the type now; the inode is cleared just below */
	is_directory = S_ISDIR(inode->i_mode);

	/* Do this BEFORE marking the inode not in use or returning an error */
	ext4_clear_inode(inode);

	es = EXT4_SB(sb)->s_es;
	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
		goto error_return;
	}
	/* Inode numbers are 1-based; group and bit indices are 0-based */
	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	/* Don't bother if the inode bitmap is corrupt. */
	grp = ext4_get_group_info(sb, block_group);
	if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
		goto error_return;

	BUFFER_TRACE(bitmap_bh, "get_write_access");
	fatal = ext4_journal_get_write_access(handle, bitmap_bh);
	if (fatal)
		goto error_return;

	/* -ESRCH stands unless the group descriptor lookup succeeds */
	fatal = -ESRCH;
	gdp = ext4_get_group_desc(sb, block_group, &bh2);
	if (gdp) {
		BUFFER_TRACE(bh2, "get_write_access");
		fatal = ext4_journal_get_write_access(handle, bh2);
	}
	/* The bit is cleared under the group lock even if 'fatal' is set */
	ext4_lock_group(sb, block_group);
	cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
	if (fatal || !cleared) {
		ext4_unlock_group(sb, block_group);
		goto out;
	}

	/* Still under the group lock: update descriptor counts + checksums */
	count = ext4_free_inodes_count(sb, gdp) + 1;
	ext4_free_inodes_set(sb, gdp, count);
	if (is_directory) {
		count = ext4_used_dirs_count(sb, gdp) - 1;
		ext4_used_dirs_set(sb, gdp, count);
		percpu_counter_dec(&sbi->s_dirs_counter);
	}
	ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
				   EXT4_INODES_PER_GROUP(sb) / 8);
	ext4_group_desc_csum_set(sb, block_group, gdp);
	ext4_unlock_group(sb, block_group);

	/* Filesystem-wide and flex-group counters are updated outside the lock */
	percpu_counter_inc(&sbi->s_freeinodes_counter);
	if (sbi->s_log_groups_per_flex) {
		ext4_group_t f = ext4_flex_group(sbi, block_group);

		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
		if (is_directory)
			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
	}
	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
	if (cleared) {
		/* The bitmap was modified: dirty it, keeping the first error */
		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
		if (!fatal)
			fatal = err;
	} else {
		/*
		 * The bit was not set although the inode was in use: treat
		 * the bitmap as corrupt, and (the first time only) take the
		 * group's free inodes out of the filesystem-wide counter.
		 */
		ext4_error(sb, "bit already cleared for inode %lu", ino);
		if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
			int count;
			count = ext4_free_inodes_count(sb, gdp);
			percpu_counter_sub(&sbi->s_freeinodes_counter,
					   count);
		}
		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
	}

error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, fatal);
}
 354
/*
 * Snapshot of the counters the Orlov allocator looks at for one block group
 * or one flex group; filled in by get_orlov_stats().
 */
struct orlov_stats {
	__u64 free_clusters;	/* free clusters in the group / flex group */
	__u32 free_inodes;	/* free inodes in the group / flex group */
	__u32 used_dirs;	/* directories in the group / flex group */
};
 360
 361/*
 362 * Helper function for Orlov's allocator; returns critical information
 363 * for a particular block group or flex_bg.  If flex_size is 1, then g
 364 * is a block group number; otherwise it is flex_bg number.
 365 */
 366static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 367			    int flex_size, struct orlov_stats *stats)
 368{
 369	struct ext4_group_desc *desc;
 370	struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
 371
 372	if (flex_size > 1) {
 373		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
 374		stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
 375		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
 376		return;
 377	}
 378
 379	desc = ext4_get_group_desc(sb, g, NULL);
 380	if (desc) {
 381		stats->free_inodes = ext4_free_inodes_count(sb, desc);
 382		stats->free_clusters = ext4_free_group_clusters(sb, desc);
 383		stats->used_dirs = ext4_used_dirs_count(sb, desc);
 384	} else {
 385		stats->free_inodes = 0;
 386		stats->free_clusters = 0;
 387		stats->used_dirs = 0;
 388	}
 389}
 390
 391/*
 392 * Orlov's allocator for directories.
 393 *
 394 * We always try to spread first-level directories.
 395 *
 396 * If there are blockgroups with both free inodes and free blocks counts
 397 * not worse than average we return one with smallest directory count.
 398 * Otherwise we simply return a random group.
 399 *
 400 * For the rest rules look so:
 401 *
 402 * It's OK to put directory into a group unless
 403 * it has too many directories already (max_dirs) or
 404 * it has too few free inodes left (min_inodes) or
 405 * it has too few free blocks left (min_blocks) or
 406 * Parent's group is preferred, if it doesn't satisfy these
 407 * conditions we search cyclically through the rest. If none
 408 * of the groups look good we just look for a group with more
 409 * free inodes than average (starting at parent's group).
 410 */
 411
 412static int find_group_orlov(struct super_block *sb, struct inode *parent,
 413			    ext4_group_t *group, umode_t mode,
 414			    const struct qstr *qstr)
 415{
 416	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 417	struct ext4_sb_info *sbi = EXT4_SB(sb);
 418	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
 419	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
 420	unsigned int freei, avefreei, grp_free;
 421	ext4_fsblk_t freeb, avefreec;
 422	unsigned int ndirs;
 423	int max_dirs, min_inodes;
 424	ext4_grpblk_t min_clusters;
 425	ext4_group_t i, grp, g, ngroups;
 426	struct ext4_group_desc *desc;
 427	struct orlov_stats stats;
 428	int flex_size = ext4_flex_bg_size(sbi);
 429	struct dx_hash_info hinfo;
 430
 431	ngroups = real_ngroups;
 432	if (flex_size > 1) {
 433		ngroups = (real_ngroups + flex_size - 1) >>
 434			sbi->s_log_groups_per_flex;
 435		parent_group >>= sbi->s_log_groups_per_flex;
 436	}
 437
 438	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
 439	avefreei = freei / ngroups;
 440	freeb = EXT4_C2B(sbi,
 441		percpu_counter_read_positive(&sbi->s_freeclusters_counter));
 442	avefreec = freeb;
 443	do_div(avefreec, ngroups);
 444	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 445
 446	if (S_ISDIR(mode) &&
 447	    ((parent == sb->s_root->d_inode) ||
 448	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
 449		int best_ndir = inodes_per_group;
 450		int ret = -1;
 451
 452		if (qstr) {
 453			hinfo.hash_version = DX_HASH_HALF_MD4;
 454			hinfo.seed = sbi->s_hash_seed;
 455			ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
 456			grp = hinfo.hash;
 457		} else
 458			grp = prandom_u32();
 459		parent_group = (unsigned)grp % ngroups;
 460		for (i = 0; i < ngroups; i++) {
 461			g = (parent_group + i) % ngroups;
 462			get_orlov_stats(sb, g, flex_size, &stats);
 463			if (!stats.free_inodes)
 464				continue;
 465			if (stats.used_dirs >= best_ndir)
 466				continue;
 467			if (stats.free_inodes < avefreei)
 468				continue;
 469			if (stats.free_clusters < avefreec)
 470				continue;
 471			grp = g;
 472			ret = 0;
 473			best_ndir = stats.used_dirs;
 474		}
 475		if (ret)
 476			goto fallback;
 477	found_flex_bg:
 478		if (flex_size == 1) {
 479			*group = grp;
 480			return 0;
 481		}
 482
 483		/*
 484		 * We pack inodes at the beginning of the flexgroup's
 485		 * inode tables.  Block allocation decisions will do
 486		 * something similar, although regular files will
 487		 * start at 2nd block group of the flexgroup.  See
 488		 * ext4_ext_find_goal() and ext4_find_near().
 489		 */
 490		grp *= flex_size;
 491		for (i = 0; i < flex_size; i++) {
 492			if (grp+i >= real_ngroups)
 493				break;
 494			desc = ext4_get_group_desc(sb, grp+i, NULL);
 495			if (desc && ext4_free_inodes_count(sb, desc)) {
 496				*group = grp+i;
 497				return 0;
 498			}
 499		}
 500		goto fallback;
 501	}
 502
 503	max_dirs = ndirs / ngroups + inodes_per_group / 16;
 504	min_inodes = avefreei - inodes_per_group*flex_size / 4;
 505	if (min_inodes < 1)
 506		min_inodes = 1;
 507	min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
 508
 509	/*
 510	 * Start looking in the flex group where we last allocated an
 511	 * inode for this parent directory
 512	 */
 513	if (EXT4_I(parent)->i_last_alloc_group != ~0) {
 514		parent_group = EXT4_I(parent)->i_last_alloc_group;
 515		if (flex_size > 1)
 516			parent_group >>= sbi->s_log_groups_per_flex;
 517	}
 518
 519	for (i = 0; i < ngroups; i++) {
 520		grp = (parent_group + i) % ngroups;
 521		get_orlov_stats(sb, grp, flex_size, &stats);
 522		if (stats.used_dirs >= max_dirs)
 523			continue;
 524		if (stats.free_inodes < min_inodes)
 525			continue;
 526		if (stats.free_clusters < min_clusters)
 527			continue;
 528		goto found_flex_bg;
 529	}
 530
 531fallback:
 532	ngroups = real_ngroups;
 533	avefreei = freei / ngroups;
 534fallback_retry:
 535	parent_group = EXT4_I(parent)->i_block_group;
 536	for (i = 0; i < ngroups; i++) {
 537		grp = (parent_group + i) % ngroups;
 538		desc = ext4_get_group_desc(sb, grp, NULL);
 539		if (desc) {
 540			grp_free = ext4_free_inodes_count(sb, desc);
 541			if (grp_free && grp_free >= avefreei) {
 542				*group = grp;
 543				return 0;
 544			}
 545		}
 546	}
 547
 548	if (avefreei) {
 549		/*
 550		 * The free-inodes counter is approximate, and for really small
 551		 * filesystems the above test can fail to find any blockgroups
 552		 */
 553		avefreei = 0;
 554		goto fallback_retry;
 555	}
 556
 557	return -1;
 558}
 559
 560static int find_group_other(struct super_block *sb, struct inode *parent,
 561			    ext4_group_t *group, umode_t mode)
 562{
 563	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 564	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
 565	struct ext4_group_desc *desc;
 566	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
 567
 568	/*
 569	 * Try to place the inode is the same flex group as its
 570	 * parent.  If we can't find space, use the Orlov algorithm to
 571	 * find another flex group, and store that information in the
 572	 * parent directory's inode information so that use that flex
 573	 * group for future allocations.
 574	 */
 575	if (flex_size > 1) {
 576		int retry = 0;
 577
 578	try_again:
 579		parent_group &= ~(flex_size-1);
 580		last = parent_group + flex_size;
 581		if (last > ngroups)
 582			last = ngroups;
 583		for  (i = parent_group; i < last; i++) {
 584			desc = ext4_get_group_desc(sb, i, NULL);
 585			if (desc && ext4_free_inodes_count(sb, desc)) {
 586				*group = i;
 587				return 0;
 588			}
 589		}
 590		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
 591			retry = 1;
 592			parent_group = EXT4_I(parent)->i_last_alloc_group;
 593			goto try_again;
 594		}
 595		/*
 596		 * If this didn't work, use the Orlov search algorithm
 597		 * to find a new flex group; we pass in the mode to
 598		 * avoid the topdir algorithms.
 599		 */
 600		*group = parent_group + flex_size;
 601		if (*group > ngroups)
 602			*group = 0;
 603		return find_group_orlov(sb, parent, group, mode, NULL);
 604	}
 605
 606	/*
 607	 * Try to place the inode in its parent directory
 608	 */
 609	*group = parent_group;
 610	desc = ext4_get_group_desc(sb, *group, NULL);
 611	if (desc && ext4_free_inodes_count(sb, desc) &&
 612	    ext4_free_group_clusters(sb, desc))
 613		return 0;
 614
 615	/*
 616	 * We're going to place this inode in a different blockgroup from its
 617	 * parent.  We want to cause files in a common directory to all land in
 618	 * the same blockgroup.  But we want files which are in a different
 619	 * directory which shares a blockgroup with our parent to land in a
 620	 * different blockgroup.
 621	 *
 622	 * So add our directory's i_ino into the starting point for the hash.
 623	 */
 624	*group = (*group + parent->i_ino) % ngroups;
 625
 626	/*
 627	 * Use a quadratic hash to find a group with a free inode and some free
 628	 * blocks.
 629	 */
 630	for (i = 1; i < ngroups; i <<= 1) {
 631		*group += i;
 632		if (*group >= ngroups)
 633			*group -= ngroups;
 634		desc = ext4_get_group_desc(sb, *group, NULL);
 635		if (desc && ext4_free_inodes_count(sb, desc) &&
 636		    ext4_free_group_clusters(sb, desc))
 637			return 0;
 638	}
 639
 640	/*
 641	 * That failed: try linear search for a free inode, even if that group
 642	 * has no free blocks.
 643	 */
 644	*group = parent_group;
 645	for (i = 0; i < ngroups; i++) {
 646		if (++*group >= ngroups)
 647			*group = 0;
 648		desc = ext4_get_group_desc(sb, *group, NULL);
 649		if (desc && ext4_free_inodes_count(sb, desc))
 650			return 0;
 651	}
 652
 653	return -1;
 654}
 655
 656/*
 657 * In no journal mode, if an inode has recently been deleted, we want
 658 * to avoid reusing it until we're reasonably sure the inode table
 659 * block has been written back to disk.  (Yes, these values are
 660 * somewhat arbitrary...)
 661 */
 662#define RECENTCY_MIN	5
 663#define RECENTCY_DIRTY	30
 664
 665static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
 666{
 667	struct ext4_group_desc	*gdp;
 668	struct ext4_inode	*raw_inode;
 669	struct buffer_head	*bh;
 670	unsigned long		dtime, now;
 671	int	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
 672	int	offset, ret = 0, recentcy = RECENTCY_MIN;
 673
 674	gdp = ext4_get_group_desc(sb, group, NULL);
 675	if (unlikely(!gdp))
 676		return 0;
 677
 678	bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
 679		       (ino / inodes_per_block));
 680	if (unlikely(!bh) || !buffer_uptodate(bh))
 681		/*
 682		 * If the block is not in the buffer cache, then it
 683		 * must have been written out.
 684		 */
 685		goto out;
 686
 687	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
 688	raw_inode = (struct ext4_inode *) (bh->b_data + offset);
 689	dtime = le32_to_cpu(raw_inode->i_dtime);
 690	now = get_seconds();
 691	if (buffer_dirty(bh))
 692		recentcy += RECENTCY_DIRTY;
 693
 694	if (dtime && (dtime < now) && (now < dtime + recentcy))
 695		ret = 1;
 696out:
 697	brelse(bh);
 698	return ret;
 699}
 700
 701/*
 702 * There are two policies for allocating an inode.  If the new inode is
 703 * a directory, then a forward search is made for a block group with both
 704 * free space and a low directory-to-inode ratio; if that fails, then of
 705 * the groups with above-average free space, that group with the fewest
 706 * directories already is chosen.
 707 *
 708 * For other inodes, search forward from the parent directory's block
 709 * group to find a free inode.
 710 */
 711struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 712			       umode_t mode, const struct qstr *qstr,
 713			       __u32 goal, uid_t *owner, int handle_type,
 714			       unsigned int line_no, int nblocks)
 715{
 716	struct super_block *sb;
 717	struct buffer_head *inode_bitmap_bh = NULL;
 718	struct buffer_head *group_desc_bh;
 719	ext4_group_t ngroups, group = 0;
 720	unsigned long ino = 0;
 721	struct inode *inode;
 722	struct ext4_group_desc *gdp = NULL;
 723	struct ext4_inode_info *ei;
 724	struct ext4_sb_info *sbi;
 725	int ret2, err = 0;
 726	struct inode *ret;
 727	ext4_group_t i;
 728	ext4_group_t flex_group;
 729	struct ext4_group_info *grp;
 730
 731	/* Cannot create files in a deleted directory */
 732	if (!dir || !dir->i_nlink)
 733		return ERR_PTR(-EPERM);
 734
 735	sb = dir->i_sb;
 736	ngroups = ext4_get_groups_count(sb);
 737	trace_ext4_request_inode(dir, mode);
 738	inode = new_inode(sb);
 739	if (!inode)
 740		return ERR_PTR(-ENOMEM);
 741	ei = EXT4_I(inode);
 742	sbi = EXT4_SB(sb);
 743
 744	/*
 745	 * Initalize owners and quota early so that we don't have to account
 746	 * for quota initialization worst case in standard inode creating
 747	 * transaction
 748	 */
 749	if (owner) {
 750		inode->i_mode = mode;
 751		i_uid_write(inode, owner[0]);
 752		i_gid_write(inode, owner[1]);
 753	} else if (test_opt(sb, GRPID)) {
 754		inode->i_mode = mode;
 755		inode->i_uid = current_fsuid();
 756		inode->i_gid = dir->i_gid;
 757	} else
 758		inode_init_owner(inode, dir, mode);
 759	dquot_initialize(inode);
 760
 761	if (!goal)
 762		goal = sbi->s_inode_goal;
 763
 764	if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
 765		group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
 766		ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
 767		ret2 = 0;
 768		goto got_group;
 769	}
 770
 771	if (S_ISDIR(mode))
 772		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
 773	else
 774		ret2 = find_group_other(sb, dir, &group, mode);
 775
 776got_group:
 777	EXT4_I(dir)->i_last_alloc_group = group;
 778	err = -ENOSPC;
 779	if (ret2 == -1)
 780		goto out;
 781
 782	/*
 783	 * Normally we will only go through one pass of this loop,
 784	 * unless we get unlucky and it turns out the group we selected
 785	 * had its last inode grabbed by someone else.
 786	 */
 787	for (i = 0; i < ngroups; i++, ino = 0) {
 788		err = -EIO;
 789
 790		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
 791		if (!gdp)
 792			goto out;
 793
 794		/*
 795		 * Check free inodes count before loading bitmap.
 796		 */
 797		if (ext4_free_inodes_count(sb, gdp) == 0) {
 798			if (++group == ngroups)
 799				group = 0;
 800			continue;
 801		}
 802
 803		grp = ext4_get_group_info(sb, group);
 804		/* Skip groups with already-known suspicious inode tables */
 805		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
 806			if (++group == ngroups)
 807				group = 0;
 808			continue;
 809		}
 810
 811		brelse(inode_bitmap_bh);
 812		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
 813		/* Skip groups with suspicious inode tables */
 814		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
 815			if (++group == ngroups)
 816				group = 0;
 817			continue;
 818		}
 819
 820repeat_in_this_group:
 821		ino = ext4_find_next_zero_bit((unsigned long *)
 822					      inode_bitmap_bh->b_data,
 823					      EXT4_INODES_PER_GROUP(sb), ino);
 824		if (ino >= EXT4_INODES_PER_GROUP(sb))
 825			goto next_group;
 826		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
 827			ext4_error(sb, "reserved inode found cleared - "
 828				   "inode=%lu", ino + 1);
 829			continue;
 830		}
 831		if ((EXT4_SB(sb)->s_journal == NULL) &&
 832		    recently_deleted(sb, group, ino)) {
 833			ino++;
 834			goto next_inode;
 835		}
 836		if (!handle) {
 837			BUG_ON(nblocks <= 0);
 838			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
 839							 handle_type, nblocks,
 840							 0);
 841			if (IS_ERR(handle)) {
 842				err = PTR_ERR(handle);
 843				ext4_std_error(sb, err);
 844				goto out;
 845			}
 846		}
 847		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
 848		err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
 849		if (err) {
 850			ext4_std_error(sb, err);
 851			goto out;
 852		}
 853		ext4_lock_group(sb, group);
 854		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
 855		ext4_unlock_group(sb, group);
 856		ino++;		/* the inode bitmap is zero-based */
 857		if (!ret2)
 858			goto got; /* we grabbed the inode! */
 859next_inode:
 860		if (ino < EXT4_INODES_PER_GROUP(sb))
 861			goto repeat_in_this_group;
 862next_group:
 863		if (++group == ngroups)
 864			group = 0;
 865	}
 866	err = -ENOSPC;
 867	goto out;
 868
 869got:
 870	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
 871	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
 872	if (err) {
 873		ext4_std_error(sb, err);
 874		goto out;
 875	}
 876
 877	BUFFER_TRACE(group_desc_bh, "get_write_access");
 878	err = ext4_journal_get_write_access(handle, group_desc_bh);
 879	if (err) {
 880		ext4_std_error(sb, err);
 881		goto out;
 882	}
 883
 884	/* We may have to initialize the block bitmap if it isn't already */
 885	if (ext4_has_group_desc_csum(sb) &&
 886	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 887		struct buffer_head *block_bitmap_bh;
 888
 889		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
 890		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
 891		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 892		if (err) {
 893			brelse(block_bitmap_bh);
 894			ext4_std_error(sb, err);
 895			goto out;
 896		}
 897
 898		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
 899		err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
 900
 901		/* recheck and clear flag under lock if we still need to */
 902		ext4_lock_group(sb, group);
 903		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 904			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 905			ext4_free_group_clusters_set(sb, gdp,
 906				ext4_free_clusters_after_init(sb, group, gdp));
 907			ext4_block_bitmap_csum_set(sb, group, gdp,
 908						   block_bitmap_bh);
 909			ext4_group_desc_csum_set(sb, group, gdp);
 910		}
 911		ext4_unlock_group(sb, group);
 912		brelse(block_bitmap_bh);
 913
 914		if (err) {
 915			ext4_std_error(sb, err);
 916			goto out;
 917		}
 918	}
 919
 920	/* Update the relevant bg descriptor fields */
 921	if (ext4_has_group_desc_csum(sb)) {
 922		int free;
 923		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 924
 925		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
 926		ext4_lock_group(sb, group); /* while we modify the bg desc */
 927		free = EXT4_INODES_PER_GROUP(sb) -
 928			ext4_itable_unused_count(sb, gdp);
 929		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 930			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
 931			free = 0;
 932		}
 933		/*
 934		 * Check the relative inode number against the last used
 935		 * relative inode number in this group. if it is greater
 936		 * we need to update the bg_itable_unused count
 937		 */
 938		if (ino > free)
 939			ext4_itable_unused_set(sb, gdp,
 940					(EXT4_INODES_PER_GROUP(sb) - ino));
 941		up_read(&grp->alloc_sem);
 942	} else {
 943		ext4_lock_group(sb, group);
 944	}
 945
 946	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
 947	if (S_ISDIR(mode)) {
 948		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
 949		if (sbi->s_log_groups_per_flex) {
 950			ext4_group_t f = ext4_flex_group(sbi, group);
 951
 952			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
 953		}
 954	}
 955	if (ext4_has_group_desc_csum(sb)) {
 956		ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
 957					   EXT4_INODES_PER_GROUP(sb) / 8);
 958		ext4_group_desc_csum_set(sb, group, gdp);
 959	}
 960	ext4_unlock_group(sb, group);
 961
 962	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 963	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
 964	if (err) {
 965		ext4_std_error(sb, err);
 966		goto out;
 967	}
 968
 969	percpu_counter_dec(&sbi->s_freeinodes_counter);
 970	if (S_ISDIR(mode))
 971		percpu_counter_inc(&sbi->s_dirs_counter);
 972
 973	if (sbi->s_log_groups_per_flex) {
 974		flex_group = ext4_flex_group(sbi, group);
 975		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 976	}
 977
 978	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 979	/* This is the optimal IO size (for stat), not the fs block size */
 980	inode->i_blocks = 0;
 981	inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
 982						       ext4_current_time(inode);
 983
 984	memset(ei->i_data, 0, sizeof(ei->i_data));
 985	ei->i_dir_start_lookup = 0;
 986	ei->i_disksize = 0;
 987
 988	/* Don't inherit extent flag from directory, amongst others. */
 989	ei->i_flags =
 990		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
 991	ei->i_file_acl = 0;
 992	ei->i_dtime = 0;
 993	ei->i_block_group = group;
 994	ei->i_last_alloc_group = ~0;
 995
 996	ext4_set_inode_flags(inode);
 997	if (IS_DIRSYNC(inode))
 998		ext4_handle_sync(handle);
 999	if (insert_inode_locked(inode) < 0) {
1000		/*
1001		 * Likely a bitmap corruption causing inode to be allocated
1002		 * twice.
1003		 */
1004		err = -EIO;
1005		ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
1006			   inode->i_ino);
1007		goto out;
1008	}
1009	spin_lock(&sbi->s_next_gen_lock);
1010	inode->i_generation = sbi->s_next_generation++;
1011	spin_unlock(&sbi->s_next_gen_lock);
1012
1013	/* Precompute checksum seed for inode metadata */
1014	if (ext4_has_metadata_csum(sb)) {
1015		__u32 csum;
1016		__le32 inum = cpu_to_le32(inode->i_ino);
1017		__le32 gen = cpu_to_le32(inode->i_generation);
1018		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
1019				   sizeof(inum));
1020		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
1021					      sizeof(gen));
1022	}
1023
1024	ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
1025	ext4_set_inode_state(inode, EXT4_STATE_NEW);
1026
1027	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
1028
1029	ei->i_inline_off = 0;
1030	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
1031		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
1032
1033	ret = inode;
1034	err = dquot_alloc_inode(inode);
1035	if (err)
1036		goto fail_drop;
1037
1038	err = ext4_init_acl(handle, inode, dir);
1039	if (err)
1040		goto fail_free_drop;
1041
1042	err = ext4_init_security(handle, inode, dir, qstr);
1043	if (err)
1044		goto fail_free_drop;
1045
1046	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1047		/* set extent flag only for directory, file and normal symlink*/
1048		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1049			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1050			ext4_ext_tree_init(handle, inode);
1051		}
1052	}
1053
1054	if (ext4_handle_valid(handle)) {
1055		ei->i_sync_tid = handle->h_transaction->t_tid;
1056		ei->i_datasync_tid = handle->h_transaction->t_tid;
1057	}
1058
1059	err = ext4_mark_inode_dirty(handle, inode);
1060	if (err) {
1061		ext4_std_error(sb, err);
1062		goto fail_free_drop;
1063	}
1064
1065	ext4_debug("allocating inode %lu\n", inode->i_ino);
1066	trace_ext4_allocate_inode(inode, dir, mode);
1067	brelse(inode_bitmap_bh);
1068	return ret;
1069
1070fail_free_drop:
1071	dquot_free_inode(inode);
1072fail_drop:
1073	clear_nlink(inode);
1074	unlock_new_inode(inode);
1075out:
1076	dquot_drop(inode);
1077	inode->i_flags |= S_NOQUOTA;
1078	iput(inode);
1079	brelse(inode_bitmap_bh);
1080	return ERR_PTR(err);
1081}
1082
1083/* Verify that we are loading a valid orphan from disk */
1084struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1085{
1086	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
1087	ext4_group_t block_group;
1088	int bit;
1089	struct buffer_head *bitmap_bh;
1090	struct inode *inode = NULL;
1091	long err = -EIO;
1092
1093	/* Error cases - e2fsck has already cleaned up for us */
1094	if (ino > max_ino) {
1095		ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
1096		goto error;
1097	}
1098
1099	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
1100	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
1101	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
1102	if (!bitmap_bh) {
1103		ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
1104		goto error;
1105	}
1106
1107	/* Having the inode bit set should be a 100% indicator that this
1108	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
1109	 * inodes that were being truncated, so we can't check i_nlink==0.
1110	 */
1111	if (!ext4_test_bit(bit, bitmap_bh->b_data))
1112		goto bad_orphan;
1113
1114	inode = ext4_iget(sb, ino);
1115	if (IS_ERR(inode))
1116		goto iget_failed;
1117
1118	/*
1119	 * If the orphans has i_nlinks > 0 then it should be able to be
1120	 * truncated, otherwise it won't be removed from the orphan list
1121	 * during processing and an infinite loop will result.
1122	 */
1123	if (inode->i_nlink && !ext4_can_truncate(inode))
1124		goto bad_orphan;
1125
1126	if (NEXT_ORPHAN(inode) > max_ino)
1127		goto bad_orphan;
1128	brelse(bitmap_bh);
1129	return inode;
1130
1131iget_failed:
1132	err = PTR_ERR(inode);
1133	inode = NULL;
1134bad_orphan:
1135	ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
1136	printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
1137	       bit, (unsigned long long)bitmap_bh->b_blocknr,
1138	       ext4_test_bit(bit, bitmap_bh->b_data));
1139	printk(KERN_WARNING "inode=%p\n", inode);
1140	if (inode) {
1141		printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
1142		       is_bad_inode(inode));
1143		printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
1144		       NEXT_ORPHAN(inode));
1145		printk(KERN_WARNING "max_ino=%lu\n", max_ino);
1146		printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
1147		/* Avoid freeing blocks if we got a bad deleted inode */
1148		if (inode->i_nlink == 0)
1149			inode->i_blocks = 0;
1150		iput(inode);
1151	}
1152	brelse(bitmap_bh);
1153error:
1154	return ERR_PTR(err);
1155}
1156
1157unsigned long ext4_count_free_inodes(struct super_block *sb)
1158{
1159	unsigned long desc_count;
1160	struct ext4_group_desc *gdp;
1161	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1162#ifdef EXT4FS_DEBUG
1163	struct ext4_super_block *es;
1164	unsigned long bitmap_count, x;
1165	struct buffer_head *bitmap_bh = NULL;
1166
1167	es = EXT4_SB(sb)->s_es;
1168	desc_count = 0;
1169	bitmap_count = 0;
1170	gdp = NULL;
1171	for (i = 0; i < ngroups; i++) {
1172		gdp = ext4_get_group_desc(sb, i, NULL);
1173		if (!gdp)
1174			continue;
1175		desc_count += ext4_free_inodes_count(sb, gdp);
1176		brelse(bitmap_bh);
1177		bitmap_bh = ext4_read_inode_bitmap(sb, i);
1178		if (!bitmap_bh)
1179			continue;
1180
1181		x = ext4_count_free(bitmap_bh->b_data,
1182				    EXT4_INODES_PER_GROUP(sb) / 8);
1183		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
1184			(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
1185		bitmap_count += x;
1186	}
1187	brelse(bitmap_bh);
1188	printk(KERN_DEBUG "ext4_count_free_inodes: "
1189	       "stored = %u, computed = %lu, %lu\n",
1190	       le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
1191	return desc_count;
1192#else
1193	desc_count = 0;
1194	for (i = 0; i < ngroups; i++) {
1195		gdp = ext4_get_group_desc(sb, i, NULL);
1196		if (!gdp)
1197			continue;
1198		desc_count += ext4_free_inodes_count(sb, gdp);
1199		cond_resched();
1200	}
1201	return desc_count;
1202#endif
1203}
1204
1205/* Called at mount-time, super-block is locked */
1206unsigned long ext4_count_dirs(struct super_block * sb)
1207{
1208	unsigned long count = 0;
1209	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
1210
1211	for (i = 0; i < ngroups; i++) {
1212		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1213		if (!gdp)
1214			continue;
1215		count += ext4_used_dirs_count(sb, gdp);
1216	}
1217	return count;
1218}
1219
/*
 * Zeroes not yet zeroed inode table - just write zeroes through the whole
 * inode table. Must be called without any spinlock held. The only place
 * where it is called from on active part of filesystem is ext4lazyinit
 * thread, so we do not need any special locks, however we have to prevent
 * inode allocation from the current group, so we take alloc_sem lock, to
 * block ext4_new_inode() until we are finished.
 *
 * @sb:      super block of the filesystem
 * @group:   block group whose inode table is to be zeroed
 * @barrier: if non-zero, blkdev_issue_flush() is called after the zeroout
 *
 * Returns 0 on success, and also when there is nothing to do (the group
 * descriptor could not be obtained, or the group already carries
 * EXT4_BG_INODE_ZEROED).  Returns 1 if the filesystem is read-only or the
 * computed number of used inode table blocks is out of range.  Otherwise
 * returns the error from starting the journal handle, getting write access
 * to the group descriptor buffer, sb_issue_zeroout() or
 * ext4_handle_dirty_metadata().
 */
int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
				 int barrier)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_group_desc *gdp = NULL;
	struct buffer_head *group_desc_bh;
	handle_t *handle;
	ext4_fsblk_t blk;
	int num, ret = 0, used_blks = 0;

	/* This should not happen, but just to be sure check this */
	if (sb->s_flags & MS_RDONLY) {
		ret = 1;
		goto out;
	}

	/* No descriptor: bail out with ret still 0 */
	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
	if (!gdp)
		goto out;

	/*
	 * We do not need to lock this, because we are the only one
	 * handling this flag.
	 */
	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
		goto out;

	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	/* From here on every exit must go through err_out to drop both */
	down_write(&grp->alloc_sem);
	/*
	 * If inode bitmap was already initialized there may be some
	 * used inodes so we need to skip blocks with used inodes in
	 * inode table.
	 */
	if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
		used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
			    ext4_itable_unused_count(sb, gdp)),
			    sbi->s_inodes_per_block);

	/* Reject a used-block count that does not fit the inode table */
	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
		ext4_error(sb, "Something is wrong with group %u: "
			   "used itable blocks: %d; "
			   "itable unused count: %u",
			   group, used_blks,
			   ext4_itable_unused_count(sb, gdp));
		ret = 1;
		goto err_out;
	}

	/* First block to zero and number of blocks left after the used ones */
	blk = ext4_inode_table(sb, gdp) + used_blks;
	num = sbi->s_itb_per_group - used_blks;

	BUFFER_TRACE(group_desc_bh, "get_write_access");
	ret = ext4_journal_get_write_access(handle,
					    group_desc_bh);
	if (ret)
		goto err_out;

	/*
	 * Skip zeroout if the inode table is full. But we set the ZEROED
	 * flag anyway, because obviously, when it is full it does not need
	 * further zeroing.
	 */
	if (unlikely(num == 0))
		goto skip_zeroout;

	ext4_debug("going to zero out inode table in group %d\n",
		   group);
	ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
	if (ret < 0)
		goto err_out;
	/* The return value of the flush is not checked */
	if (barrier)
		blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);

skip_zeroout:
	/* Flag and checksum are updated together under the group lock */
	ext4_lock_group(sb, group);
	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
	ext4_group_desc_csum_set(sb, group, gdp);
	ext4_unlock_group(sb, group);

	BUFFER_TRACE(group_desc_bh,
		     "call ext4_handle_dirty_metadata");
	ret = ext4_handle_dirty_metadata(handle, NULL,
					 group_desc_bh);

err_out:
	/* Common exit once the handle is started and alloc_sem is held */
	up_write(&grp->alloc_sem);
	ext4_journal_stop(handle);
out:
	return ret;
}