fs/gfs2/super.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / gfs2 / super.c
at master 39 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   4 * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
   5 */
   6
   7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   8
   9#include <linux/bio.h>
  10#include <linux/sched/signal.h>
  11#include <linux/slab.h>
  12#include <linux/spinlock.h>
  13#include <linux/completion.h>
  14#include <linux/buffer_head.h>
  15#include <linux/statfs.h>
  16#include <linux/seq_file.h>
  17#include <linux/mount.h>
  18#include <linux/kthread.h>
  19#include <linux/delay.h>
  20#include <linux/gfs2_ondisk.h>
  21#include <linux/crc32.h>
  22#include <linux/time.h>
  23#include <linux/wait.h>
  24#include <linux/writeback.h>
  25#include <linux/backing-dev.h>
  26#include <linux/kernel.h>
  27
  28#include "gfs2.h"
  29#include "incore.h"
  30#include "bmap.h"
  31#include "dir.h"
  32#include "glock.h"
  33#include "glops.h"
  34#include "inode.h"
  35#include "log.h"
  36#include "meta_io.h"
  37#include "quota.h"
  38#include "recovery.h"
  39#include "rgrp.h"
  40#include "super.h"
  41#include "trans.h"
  42#include "util.h"
  43#include "sys.h"
  44#include "xattr.h"
  45#include "lops.h"
  46
/*
 * Decision values for inode eviction; this is the return type of
 * gfs2_upgrade_iopen_glock().
 * NOTE(review): the precise meaning of each value is determined by the
 * eviction code that consumes it -- confirm against those callers.
 */
enum evict_behavior {
	EVICT_SHOULD_DELETE,
	EVICT_SHOULD_SKIP_DELETE,
	EVICT_SHOULD_DEFER_DELETE,
};
  52
  53/**
  54 * gfs2_jindex_free - Clear all the journal index information
  55 * @sdp: The GFS2 superblock
  56 *
  57 */
  58
  59void gfs2_jindex_free(struct gfs2_sbd *sdp)
  60{
  61	struct list_head list;
  62	struct gfs2_jdesc *jd;
  63
  64	spin_lock(&sdp->sd_jindex_spin);
  65	list_add(&list, &sdp->sd_jindex_list);
  66	list_del_init(&sdp->sd_jindex_list);
  67	sdp->sd_journals = 0;
  68	spin_unlock(&sdp->sd_jindex_spin);
  69
  70	down_write(&sdp->sd_log_flush_lock);
  71	sdp->sd_jdesc = NULL;
  72	up_write(&sdp->sd_log_flush_lock);
  73
  74	while (!list_empty(&list)) {
  75		jd = list_first_entry(&list, struct gfs2_jdesc, jd_list);
  76		BUG_ON(jd->jd_log_bio);
  77		gfs2_free_journal_extents(jd);
  78		list_del(&jd->jd_list);
  79		iput(jd->jd_inode);
  80		jd->jd_inode = NULL;
  81		kfree(jd);
  82	}
  83}
  84
  85static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
  86{
  87	struct gfs2_jdesc *jd;
  88
  89	list_for_each_entry(jd, head, jd_list) {
  90		if (jd->jd_jid == jid)
  91			return jd;
  92	}
  93	return NULL;
  94}
  95
  96struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
  97{
  98	struct gfs2_jdesc *jd;
  99
 100	spin_lock(&sdp->sd_jindex_spin);
 101	jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
 102	spin_unlock(&sdp->sd_jindex_spin);
 103
 104	return jd;
 105}
 106
 107int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 108{
 109	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 110	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 111	u64 size = i_size_read(jd->jd_inode);
 112
 113	if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, BIT(30)))
 114		return -EIO;
 115
 116	jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
 117
 118	if (gfs2_write_alloc_required(ip, 0, size)) {
 119		gfs2_consist_inode(ip);
 120		return -EIO;
 121	}
 122
 123	return 0;
 124}
 125
 126/**
 127 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
 128 * @sdp: the filesystem
 129 *
 130 * Returns: errno
 131 */
 132
 133int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
 134{
 135	struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
 136	struct gfs2_glock *j_gl = ip->i_gl;
 137	int error;
 138
 139	j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
 140	if (gfs2_withdrawn(sdp))
 141		return -EIO;
 142
 143	if (sdp->sd_log_sequence == 0) {
 144		fs_err(sdp, "unknown status of our own journal jid %d",
 145		       sdp->sd_lockstruct.ls_jid);
 146		return -EIO;
 147	}
 148
 149	error = gfs2_quota_init(sdp);
 150	if (!error && gfs2_withdrawn(sdp))
 151		error = -EIO;
 152	if (!error)
 153		set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
 154	return error;
 155}
 156
 157void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
 158{
 159	const struct gfs2_statfs_change *str = buf;
 160
 161	sc->sc_total = be64_to_cpu(str->sc_total);
 162	sc->sc_free = be64_to_cpu(str->sc_free);
 163	sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
 164}
 165
 166void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf)
 167{
 168	struct gfs2_statfs_change *str = buf;
 169
 170	str->sc_total = cpu_to_be64(sc->sc_total);
 171	str->sc_free = cpu_to_be64(sc->sc_free);
 172	str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
 173}
 174
 175int gfs2_statfs_init(struct gfs2_sbd *sdp)
 176{
 177	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 178	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
 179	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 180	struct buffer_head *m_bh;
 181	struct gfs2_holder gh;
 182	int error;
 183
 184	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
 185				   &gh);
 186	if (error)
 187		return error;
 188
 189	error = gfs2_meta_inode_buffer(m_ip, &m_bh);
 190	if (error)
 191		goto out;
 192
 193	if (sdp->sd_args.ar_spectator) {
 194		spin_lock(&sdp->sd_statfs_spin);
 195		gfs2_statfs_change_in(m_sc, m_bh->b_data +
 196				      sizeof(struct gfs2_dinode));
 197		spin_unlock(&sdp->sd_statfs_spin);
 198	} else {
 199		spin_lock(&sdp->sd_statfs_spin);
 200		gfs2_statfs_change_in(m_sc, m_bh->b_data +
 201				      sizeof(struct gfs2_dinode));
 202		gfs2_statfs_change_in(l_sc, sdp->sd_sc_bh->b_data +
 203				      sizeof(struct gfs2_dinode));
 204		spin_unlock(&sdp->sd_statfs_spin);
 205
 206	}
 207
 208	brelse(m_bh);
 209out:
 210	gfs2_glock_dq_uninit(&gh);
 211	return 0;
 212}
 213
 214void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 215			s64 dinodes)
 216{
 217	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
 218	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 219	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
 220	s64 x, y;
 221	int need_sync = 0;
 222
 223	gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh);
 224
 225	spin_lock(&sdp->sd_statfs_spin);
 226	l_sc->sc_total += total;
 227	l_sc->sc_free += free;
 228	l_sc->sc_dinodes += dinodes;
 229	gfs2_statfs_change_out(l_sc, sdp->sd_sc_bh->b_data +
 230			       sizeof(struct gfs2_dinode));
 231	if (sdp->sd_args.ar_statfs_percent) {
 232		x = 100 * l_sc->sc_free;
 233		y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
 234		if (x >= y || x <= -y)
 235			need_sync = 1;
 236	}
 237	spin_unlock(&sdp->sd_statfs_spin);
 238
 239	if (need_sync)
 240		gfs2_wake_up_statfs(sdp);
 241}
 242
 243void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh)
 244{
 245	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 246	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
 247	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
 248	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 249
 250	gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh);
 251	gfs2_trans_add_meta(m_ip->i_gl, m_bh);
 252
 253	spin_lock(&sdp->sd_statfs_spin);
 254	m_sc->sc_total += l_sc->sc_total;
 255	m_sc->sc_free += l_sc->sc_free;
 256	m_sc->sc_dinodes += l_sc->sc_dinodes;
 257	memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
 258	memset(sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode),
 259	       0, sizeof(struct gfs2_statfs_change));
 260	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
 261	spin_unlock(&sdp->sd_statfs_spin);
 262}
 263
 264int gfs2_statfs_sync(struct super_block *sb, int type)
 265{
 266	struct gfs2_sbd *sdp = sb->s_fs_info;
 267	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 268	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
 269	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 270	struct gfs2_holder gh;
 271	struct buffer_head *m_bh;
 272	int error;
 273
 274	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
 275				   &gh);
 276	if (error)
 277		goto out;
 278
 279	error = gfs2_meta_inode_buffer(m_ip, &m_bh);
 280	if (error)
 281		goto out_unlock;
 282
 283	spin_lock(&sdp->sd_statfs_spin);
 284	gfs2_statfs_change_in(m_sc, m_bh->b_data +
 285			      sizeof(struct gfs2_dinode));
 286	if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
 287		spin_unlock(&sdp->sd_statfs_spin);
 288		goto out_bh;
 289	}
 290	spin_unlock(&sdp->sd_statfs_spin);
 291
 292	error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
 293	if (error)
 294		goto out_bh;
 295
 296	update_statfs(sdp, m_bh);
 297	sdp->sd_statfs_force_sync = 0;
 298
 299	gfs2_trans_end(sdp);
 300
 301out_bh:
 302	brelse(m_bh);
 303out_unlock:
 304	gfs2_glock_dq_uninit(&gh);
 305out:
 306	return error;
 307}
 308
/*
 * List node that pairs a glock holder with list linkage.  Used by
 * gfs2_lock_fs_check_clean() to remember the journal glocks it has taken.
 */
struct lfcc {
	struct list_head list;
	struct gfs2_holder gh;
};
 313
/**
 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
 *                            journals are clean
 * @sdp: the file system
 *
 * On success the freeze glock is left held in exclusive mode through
 * sd_freeze_gh.  If a failure occurs after the current freeze lock has been
 * dropped, the freeze lock is re-acquired in shared mode before returning.
 * In every case the journal glocks taken here are released again.
 *
 * Returns: errno
 */

static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
{
	struct gfs2_inode *ip;
	struct gfs2_jdesc *jd;
	struct lfcc *lfcc;
	LIST_HEAD(list);
	struct gfs2_log_header_host lh;
	int error, error2;

	/*
	 * Grab all the journal glocks in SH mode.  We are *probably* doing
	 * that to prevent recovery.
	 */

	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
		lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
		if (!lfcc) {
			error = -ENOMEM;
			goto out;
		}
		ip = GFS2_I(jd->jd_inode);
		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
		if (error) {
			kfree(lfcc);
			goto out;
		}
		list_add(&lfcc->list, &list);
	}

	/* Drop the current freeze lock before asking for it exclusively. */
	gfs2_freeze_unlock(sdp);

	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
				   LM_FLAG_RECOVER | GL_NOPID,
				   &sdp->sd_freeze_gh);
	if (error)
		goto relock_shared;

	/*
	 * Every journal must pass the sanity check and its head must carry
	 * the UNMOUNT flag; otherwise the freeze attempt fails with -EBUSY
	 * or with the error from the failing helper.
	 */
	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
		error = gfs2_jdesc_check(jd);
		if (error)
			break;
		error = gfs2_find_jhead(jd, &lh);
		if (error)
			break;
		if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
			error = -EBUSY;
			break;
		}
	}

	if (!error)
		goto out;  /* success */

	/* Failure: give up the exclusive lock and go back to shared mode. */
	gfs2_freeze_unlock(sdp);

relock_shared:
	error2 = gfs2_freeze_lock_shared(sdp);
	gfs2_assert_withdraw(sdp, !error2);

out:
	/* Release and free every journal glock holder collected above. */
	while (!list_empty(&list)) {
		lfcc = list_first_entry(&list, struct lfcc, list);
		list_del(&lfcc->list);
		gfs2_glock_dq_uninit(&lfcc->gh);
		kfree(lfcc);
	}
	return error;
}
 390
 391void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 392{
 393	const struct inode *inode = &ip->i_inode;
 394	struct gfs2_dinode *str = buf;
 395
 396	str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
 397	str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
 398	str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
 399	str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
 400	str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
 401	str->di_mode = cpu_to_be32(inode->i_mode);
 402	str->di_uid = cpu_to_be32(i_uid_read(inode));
 403	str->di_gid = cpu_to_be32(i_gid_read(inode));
 404	str->di_nlink = cpu_to_be32(inode->i_nlink);
 405	str->di_size = cpu_to_be64(i_size_read(inode));
 406	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
 407	str->di_atime = cpu_to_be64(inode_get_atime_sec(inode));
 408	str->di_mtime = cpu_to_be64(inode_get_mtime_sec(inode));
 409	str->di_ctime = cpu_to_be64(inode_get_ctime_sec(inode));
 410
 411	str->di_goal_meta = cpu_to_be64(ip->i_goal);
 412	str->di_goal_data = cpu_to_be64(ip->i_goal);
 413	str->di_generation = cpu_to_be64(ip->i_generation);
 414
 415	str->di_flags = cpu_to_be32(ip->i_diskflags);
 416	str->di_height = cpu_to_be16(ip->i_height);
 417	str->di_payload_format = cpu_to_be32(S_ISDIR(inode->i_mode) &&
 418					     !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
 419					     GFS2_FORMAT_DE : 0);
 420	str->di_depth = cpu_to_be16(ip->i_depth);
 421	str->di_entries = cpu_to_be32(ip->i_entries);
 422
 423	str->di_eattr = cpu_to_be64(ip->i_eattr);
 424	str->di_atime_nsec = cpu_to_be32(inode_get_atime_nsec(inode));
 425	str->di_mtime_nsec = cpu_to_be32(inode_get_mtime_nsec(inode));
 426	str->di_ctime_nsec = cpu_to_be32(inode_get_ctime_nsec(inode));
 427}
 428
 429/**
 430 * gfs2_write_inode - Make sure the inode is stable on the disk
 431 * @inode: The inode
 432 * @wbc: The writeback control structure
 433 *
 434 * Returns: errno
 435 */
 436
 437static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 438{
 439	struct gfs2_inode *ip = GFS2_I(inode);
 440	struct gfs2_sbd *sdp = GFS2_SB(inode);
 441	struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
 442	struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
 443	int ret = 0;
 444	bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip));
 445
 446	if (flush_all)
 447		gfs2_log_flush(GFS2_SB(inode), ip->i_gl,
 448			       GFS2_LOG_HEAD_FLUSH_NORMAL |
 449			       GFS2_LFC_WRITE_INODE);
 450	if (bdi->wb.dirty_exceeded)
 451		gfs2_ail1_flush(sdp, wbc);
 452	else
 453		filemap_fdatawrite(metamapping);
 454	if (flush_all)
 455		ret = filemap_fdatawait(metamapping);
 456	if (ret)
 457		mark_inode_dirty_sync(inode);
 458	else {
 459		spin_lock(&inode->i_lock);
 460		if (!(inode->i_flags & I_DIRTY))
 461			gfs2_ordered_del_inode(ip);
 462		spin_unlock(&inode->i_lock);
 463	}
 464	return ret;
 465}
 466
/**
 * gfs2_dirty_inode - check for atime updates
 * @inode: The inode in question
 * @flags: The type of dirty
 *
 * Unfortunately it can be called under any combination of inode
 * glock and freeze glock, so we have to check carefully.
 *
 * At the moment this deals only with atime - it should be possible
 * to expand that role in future, once a review of the locking has
 * been carried out.
 *
 * The in-core inode is written into its dinode buffer inside a
 * transaction.  The inode glock and the transaction are only acquired
 * here (and released again) when the caller does not already hold them.
 */

static void gfs2_dirty_inode(struct inode *inode, int flags)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct buffer_head *bh;
	struct gfs2_holder gh;
	int need_unlock = 0;
	int need_endtrans = 0;
	int ret;

	/* This can only happen during incomplete inode creation. */
	if (unlikely(!ip->i_gl))
		return;

	if (gfs2_withdrawn(sdp))
		return;
	/* Take the inode glock exclusively unless this task already holds it. */
	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
		if (ret) {
			fs_err(sdp, "dirty_inode: glock %d\n", ret);
			gfs2_dump_glock(NULL, ip->i_gl, true);
			return;
		}
		need_unlock = 1;
	} else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
		return;

	/* Only start a transaction if the task is not already inside one. */
	if (current->journal_info == NULL) {
		ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
		if (ret) {
			fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret);
			goto out;
		}
		need_endtrans = 1;
	}

	/* A failure to read the dinode buffer is silently ignored here. */
	ret = gfs2_meta_inode_buffer(ip, &bh);
	if (ret == 0) {
		gfs2_trans_add_meta(ip->i_gl, bh);
		gfs2_dinode_out(ip, bh->b_data);
		brelse(bh);
	}

	if (need_endtrans)
		gfs2_trans_end(sdp);
out:
	if (need_unlock)
		gfs2_glock_dq_uninit(&gh);
}
 529
/**
 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
 * @sdp: the filesystem
 *
 * This function does not return a value.  Quota and statfs syncing and the
 * log flushes are only performed if SDF_JOURNAL_LIVE was set on entry.
 */

void gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
	/* Sampled up front, before the threads are destroyed below. */
	int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);

	if (!test_bit(SDF_KILL, &sdp->sd_flags))
		gfs2_flush_delete_work(sdp);

	gfs2_destroy_threads(sdp);

	if (log_write_allowed) {
		gfs2_quota_sync(sdp->sd_vfs, 0);
		gfs2_statfs_sync(sdp->sd_vfs, 0);

		/* We do two log flushes here. The first one commits dirty inodes
		 * and rgrps to the journal, but queues up revokes to the ail list.
		 * The second flush writes out and removes the revokes.
		 *
		 * The first must be done before the FLUSH_SHUTDOWN code
		 * clears the LIVE flag, otherwise it will not be able to start
		 * a transaction to write its revokes, and the error will cause
		 * a withdraw of the file system. */
		gfs2_log_flush(sdp, NULL, GFS2_LFC_MAKE_FS_RO);
		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
			       GFS2_LFC_MAKE_FS_RO);
		/* Wait up to five seconds for the log to drain, then warn if not. */
		wait_event_timeout(sdp->sd_log_waitq,
				   gfs2_log_is_empty(sdp),
				   HZ * 5);
		gfs2_assert_warn(sdp, gfs2_log_is_empty(sdp));
	}
	gfs2_quota_cleanup(sdp);
}
 568
/**
 * gfs2_put_super - Unmount the filesystem
 * @sb: The VFS superblock
 *
 * Blocks new recovery requests, waits for recovery that is already running,
 * makes the filesystem read-only if it is not already, and then releases
 * the inodes, glocks and other structures hanging off the superblock before
 * freeing the gfs2_sbd itself.
 */

static void gfs2_put_super(struct super_block *sb)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;
	struct gfs2_jdesc *jd;

	/* No more recovery requests */
	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
	smp_mb();

	/* Wait on outstanding recovery */
restart:
	spin_lock(&sdp->sd_jindex_spin);
	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
		if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
			continue;
		/*
		 * The spinlock is dropped before sleeping, so the list walk
		 * is restarted from the top after each wait.
		 */
		spin_unlock(&sdp->sd_jindex_spin);
		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
			    TASK_UNINTERRUPTIBLE);
		goto restart;
	}
	spin_unlock(&sdp->sd_jindex_spin);

	if (!sb_rdonly(sb))
		gfs2_make_fs_ro(sdp);
	else {
		if (gfs2_withdrawn(sdp))
			gfs2_destroy_threads(sdp);

		gfs2_quota_cleanup(sdp);
	}

	/* Let any queued withdraw work finish before tearing things down. */
	flush_work(&sdp->sd_withdraw_work);

	/*  At this point, we're through modifying the disk  */

	/*  Release stuff  */

	gfs2_freeze_unlock(sdp);

	iput(sdp->sd_jindex);
	iput(sdp->sd_statfs_inode);
	iput(sdp->sd_rindex);
	iput(sdp->sd_quota_inode);

	gfs2_glock_put(sdp->sd_rename_gl);
	gfs2_glock_put(sdp->sd_freeze_gl);

	/* These resources are only released on non-spectator mounts. */
	if (!sdp->sd_args.ar_spectator) {
		if (gfs2_holder_initialized(&sdp->sd_journal_gh))
			gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
		if (gfs2_holder_initialized(&sdp->sd_jinode_gh))
			gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
		brelse(sdp->sd_sc_bh);
		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
		free_local_statfs_inodes(sdp);
		iput(sdp->sd_qc_inode);
	}

	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
	gfs2_clear_rgrpd(sdp);
	gfs2_jindex_free(sdp);
	/*  Take apart glock structures and buffer lists  */
	gfs2_gl_hash_clear(sdp);
	iput(sdp->sd_inode);
	gfs2_delete_debugfs_file(sdp);

	gfs2_sys_fs_del(sdp);
	free_sbd(sdp);
}
 645
 646/**
 647 * gfs2_sync_fs - sync the filesystem
 648 * @sb: the superblock
 649 * @wait: true to wait for completion
 650 *
 651 * Flushes the log to disk.
 652 */
 653
 654static int gfs2_sync_fs(struct super_block *sb, int wait)
 655{
 656	struct gfs2_sbd *sdp = sb->s_fs_info;
 657
 658	gfs2_quota_sync(sb, -1);
 659	if (wait)
 660		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
 661			       GFS2_LFC_SYNC_FS);
 662	return sdp->sd_log_error;
 663}
 664
 665static int gfs2_do_thaw(struct gfs2_sbd *sdp, enum freeze_holder who, const void *freeze_owner)
 666{
 667	struct super_block *sb = sdp->sd_vfs;
 668	int error;
 669
 670	error = gfs2_freeze_lock_shared(sdp);
 671	if (error)
 672		goto fail;
 673	error = thaw_super(sb, who, freeze_owner);
 674	if (!error)
 675		return 0;
 676
 677fail:
 678	fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n", error);
 679	gfs2_assert_withdraw(sdp, 0);
 680	return error;
 681}
 682
 683void gfs2_freeze_func(struct work_struct *work)
 684{
 685	struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work);
 686	struct super_block *sb = sdp->sd_vfs;
 687	int error;
 688
 689	mutex_lock(&sdp->sd_freeze_mutex);
 690	error = -EBUSY;
 691	if (test_bit(SDF_FROZEN, &sdp->sd_flags))
 692		goto freeze_failed;
 693
 694	error = freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
 695	if (error)
 696		goto freeze_failed;
 697
 698	gfs2_freeze_unlock(sdp);
 699	set_bit(SDF_FROZEN, &sdp->sd_flags);
 700
 701	error = gfs2_do_thaw(sdp, FREEZE_HOLDER_USERSPACE, NULL);
 702	if (error)
 703		goto out;
 704
 705	clear_bit(SDF_FROZEN, &sdp->sd_flags);
 706	goto out;
 707
 708freeze_failed:
 709	fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", error);
 710
 711out:
 712	mutex_unlock(&sdp->sd_freeze_mutex);
 713	deactivate_super(sb);
 714}
 715
/**
 * gfs2_freeze_super - prevent further writes to the filesystem
 * @sb: the VFS structure for the filesystem
 * @who: freeze flags
 * @freeze_owner: owner of the freeze
 *
 * Retries once per second for as long as gfs2_lock_fs_check_clean() fails
 * with anything other than -EIO.
 *
 * Returns: 0 on success, -EBUSY if the freeze mutex is contended or the
 * filesystem is already frozen, or the error from freeze_super() or a
 * fatal -EIO from gfs2_lock_fs_check_clean()
 */

static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who,
			     const void *freeze_owner)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;
	int error;

	if (!mutex_trylock(&sdp->sd_freeze_mutex))
		return -EBUSY;
	if (test_bit(SDF_FROZEN, &sdp->sd_flags)) {
		mutex_unlock(&sdp->sd_freeze_mutex);
		return -EBUSY;
	}

	for (;;) {
		error = freeze_super(sb, who, freeze_owner);
		if (error) {
			fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
				error);
			goto out;
		}

		error = gfs2_lock_fs_check_clean(sdp);
		if (!error) {
			set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
			set_bit(SDF_FROZEN, &sdp->sd_flags);
			break;
		}

		/*
		 * Undo the VFS-level freeze; the thaw result is deliberately
		 * discarded so that 'error' keeps the original failure for
		 * the checks below.
		 */
		(void)gfs2_do_thaw(sdp, who, freeze_owner);

		if (error == -EBUSY)
			fs_err(sdp, "waiting for recovery before freeze\n");
		else if (error == -EIO) {
			fs_err(sdp, "Fatal IO error: cannot freeze gfs2 due "
			       "to recovery error.\n");
			goto out;
		} else {
			fs_err(sdp, "error freezing FS: %d\n", error);
		}
		fs_err(sdp, "retrying...\n");
		msleep(1000);
	}

out:
	mutex_unlock(&sdp->sd_freeze_mutex);
	return error;
}
 771
 772static int gfs2_freeze_fs(struct super_block *sb)
 773{
 774	struct gfs2_sbd *sdp = sb->s_fs_info;
 775
 776	if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
 777		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
 778			       GFS2_LFC_FREEZE_GO_SYNC);
 779		if (gfs2_withdrawn(sdp))
 780			return -EIO;
 781	}
 782	return 0;
 783}
 784
/**
 * gfs2_thaw_super - reallow writes to the filesystem
 * @sb: the VFS structure for the filesystem
 * @who: freeze flags
 * @freeze_owner: owner of the freeze
 *
 * Returns: 0 on success, -EBUSY if the freeze mutex is contended, -EINVAL
 * if SDF_FREEZE_INITIATOR is not set, or the error from gfs2_do_thaw()
 */

static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who,
			   const void *freeze_owner)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;
	int error;

	if (!mutex_trylock(&sdp->sd_freeze_mutex))
		return -EBUSY;
	if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags)) {
		mutex_unlock(&sdp->sd_freeze_mutex);
		return -EINVAL;
	}

	/* Extra active reference; dropped by deactivate_super() below. */
	atomic_inc(&sb->s_active);
	gfs2_freeze_unlock(sdp);

	error = gfs2_do_thaw(sdp, who, freeze_owner);

	/* The freeze state flags are only cleared when the thaw succeeded. */
	if (!error) {
		clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
		clear_bit(SDF_FROZEN, &sdp->sd_flags);
	}
	mutex_unlock(&sdp->sd_freeze_mutex);
	deactivate_super(sb);
	return error;
}
 819
/**
 * statfs_slow_fill - fill in the sg for a given RG
 * @rgd: the RG
 * @sc: the sc structure
 *
 * Verifies the resource group and adds its data, free and dinode counts to
 * the running totals in @sc.
 *
 * Returns: 0 (this implementation has no failure path)
 */

static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
			    struct gfs2_statfs_change_host *sc)
{
	gfs2_rgrp_verify(rgd);
	sc->sc_total += rgd->rd_data;
	sc->sc_free += rgd->rd_free;
	sc->sc_dinodes += rgd->rd_dinodes;
	return 0;
}
 837
/**
 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
 * @sdp: the filesystem
 * @sc: the sc info that will be returned
 *
 * Any error (other than a signal) will cause this routine to fall back
 * to the synchronous version.
 * NOTE(review): no such fallback is visible in this function or in
 * gfs2_statfs(), which simply returns the error -- confirm whether the
 * sentence above is still accurate.
 *
 * Up to 64 resource group glocks are requested asynchronously at a time.
 * Each pass over the holder slots collects the requests that have
 * completed, accumulates their counts into @sc and reuses the freed slots
 * for the next resource groups.
 *
 * FIXME: This really shouldn't busy wait like this.
 *
 * Returns: errno
 */

static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
{
	struct gfs2_rgrpd *rgd_next;
	struct gfs2_holder *gha, *gh;
	unsigned int slots = 64;
	unsigned int x;
	int done;
	int error = 0, err;

	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
	gha = kmalloc_array(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
	if (!gha)
		return -ENOMEM;
	for (x = 0; x < slots; x++)
		gfs2_holder_mark_uninitialized(gha + x);

	rgd_next = gfs2_rgrpd_get_first(sdp);

	for (;;) {
		/* Cleared again below while any slot is busy or was refilled. */
		done = 1;

		for (x = 0; x < slots; x++) {
			gh = gha + x;

			/* Reap a request that the poll reports as finished. */
			if (gfs2_holder_initialized(gh) && gfs2_glock_poll(gh)) {
				err = gfs2_glock_wait(gh);
				if (err) {
					gfs2_holder_uninit(gh);
					error = err;
				} else {
					/* After an error, locks are still released but no longer counted. */
					if (!error) {
						struct gfs2_rgrpd *rgd =
							gfs2_glock2rgrp(gh->gh_gl);

						error = statfs_slow_fill(rgd, sc);
					}
					gfs2_glock_dq_uninit(gh);
				}
			}

			if (gfs2_holder_initialized(gh))
				done = 0;
			else if (rgd_next && !error) {
				/* Free slot: queue the next resource group. */
				error = gfs2_glock_nq_init(rgd_next->rd_gl,
							   LM_ST_SHARED,
							   GL_ASYNC,
							   gh);
				rgd_next = gfs2_rgrpd_get_next(rgd_next);
				done = 0;
			}

			/* No new requests are queued once error is set; pending ones still drain. */
			if (signal_pending(current))
				error = -ERESTARTSYS;
		}

		if (done)
			break;

		yield();
	}

	kfree(gha);
	return error;
}
 915
 916/**
 917 * gfs2_statfs_i - Do a statfs
 918 * @sdp: the filesystem
 919 * @sc: the sc structure
 920 *
 921 * Returns: errno
 922 */
 923
 924static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
 925{
 926	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
 927	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 928
 929	spin_lock(&sdp->sd_statfs_spin);
 930
 931	*sc = *m_sc;
 932	sc->sc_total += l_sc->sc_total;
 933	sc->sc_free += l_sc->sc_free;
 934	sc->sc_dinodes += l_sc->sc_dinodes;
 935
 936	spin_unlock(&sdp->sd_statfs_spin);
 937
 938	if (sc->sc_free < 0)
 939		sc->sc_free = 0;
 940	if (sc->sc_free > sc->sc_total)
 941		sc->sc_free = sc->sc_total;
 942	if (sc->sc_dinodes < 0)
 943		sc->sc_dinodes = 0;
 944
 945	return 0;
 946}
 947
 948/**
 949 * gfs2_statfs - Gather and return stats about the filesystem
 950 * @dentry: The name of the link
 951 * @buf: The buffer
 952 *
 953 * Returns: 0 on success or error code
 954 */
 955
 956static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 957{
 958	struct super_block *sb = dentry->d_sb;
 959	struct gfs2_sbd *sdp = sb->s_fs_info;
 960	struct gfs2_statfs_change_host sc;
 961	int error;
 962
 963	error = gfs2_rindex_update(sdp);
 964	if (error)
 965		return error;
 966
 967	if (gfs2_tune_get(sdp, gt_statfs_slow))
 968		error = gfs2_statfs_slow(sdp, &sc);
 969	else
 970		error = gfs2_statfs_i(sdp, &sc);
 971
 972	if (error)
 973		return error;
 974
 975	buf->f_type = GFS2_MAGIC;
 976	buf->f_bsize = sdp->sd_sb.sb_bsize;
 977	buf->f_blocks = sc.sc_total;
 978	buf->f_bfree = sc.sc_free;
 979	buf->f_bavail = sc.sc_free;
 980	buf->f_files = sc.sc_dinodes + sc.sc_free;
 981	buf->f_ffree = sc.sc_free;
 982	buf->f_namelen = GFS2_FNAMESIZE;
 983	buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
 984
 985	return 0;
 986}
 987
/**
 * gfs2_drop_inode - Drop an inode (test for remote unlink)
 * @inode: The inode to drop
 *
 * If we've received a callback on an iopen lock then it's because a
 * remote node tried to deallocate the inode but failed due to this node
 * still having the inode open. Here we mark the link count zero
 * since we know that it must have reached zero if the GLF_DEMOTE flag
 * is set on the iopen glock. If we didn't do a disk read since the
 * remote node removed the final link then we might otherwise miss
 * this event. This check ensures that this node will deallocate the
 * inode's blocks, or alternatively pass the baton on to another
 * node for later deallocation.
 *
 * Returns: 0 when deletion has been deferred to the delete workqueue, 1
 * while SDF_EVICTING is set, otherwise the result of inode_generic_drop()
 */

static int gfs2_drop_inode(struct inode *inode)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);

	if (inode->i_nlink &&
	    gfs2_holder_initialized(&ip->i_iopen_gh)) {
		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
		if (glock_needs_demote(gl))
			clear_nlink(inode);
	}

	/*
	 * When under memory pressure when an inode's link count has dropped to
	 * zero, defer deleting the inode to the delete workqueue.  This avoids
	 * calling into DLM under memory pressure, which can deadlock.
	 */
	if (!inode->i_nlink &&
	    unlikely(current->flags & PF_MEMALLOC) &&
	    gfs2_holder_initialized(&ip->i_iopen_gh)) {
		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;

		/*
		 * Take a glock reference for the queued work; if nothing was
		 * queued, give the reference back asynchronously.
		 */
		gfs2_glock_hold(gl);
		if (!gfs2_queue_verify_delete(gl, true))
			gfs2_glock_put_async(gl);
		return 0;
	}

	/*
	 * No longer cache inodes when trying to evict them all.
	 */
	if (test_bit(SDF_EVICTING, &sdp->sd_flags))
		return 1;

	return inode_generic_drop(inode);
}
1039
1040/**
1041 * gfs2_show_options - Show mount options for /proc/mounts
1042 * @s: seq_file structure
1043 * @root: root of this (sub)tree
1044 *
1045 * Returns: 0 on success or error code
1046 */
1047
1048static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1049{
1050	struct gfs2_sbd *sdp = root->d_sb->s_fs_info;
1051	struct gfs2_args *args = &sdp->sd_args;
1052	unsigned int logd_secs, statfs_slow, statfs_quantum, quota_quantum;
1053
1054	spin_lock(&sdp->sd_tune.gt_spin);
1055	logd_secs = sdp->sd_tune.gt_logd_secs;
1056	quota_quantum = sdp->sd_tune.gt_quota_quantum;
1057	statfs_quantum = sdp->sd_tune.gt_statfs_quantum;
1058	statfs_slow = sdp->sd_tune.gt_statfs_slow;
1059	spin_unlock(&sdp->sd_tune.gt_spin);
1060
1061	if (is_subdir(root, sdp->sd_master_dir))
1062		seq_puts(s, ",meta");
1063	if (args->ar_lockproto[0])
1064		seq_show_option(s, "lockproto", args->ar_lockproto);
1065	if (args->ar_locktable[0])
1066		seq_show_option(s, "locktable", args->ar_locktable);
1067	if (args->ar_hostdata[0])
1068		seq_show_option(s, "hostdata", args->ar_hostdata);
1069	if (args->ar_spectator)
1070		seq_puts(s, ",spectator");
1071	if (args->ar_localflocks)
1072		seq_puts(s, ",localflocks");
1073	if (args->ar_debug)
1074		seq_puts(s, ",debug");
1075	if (args->ar_posix_acl)
1076		seq_puts(s, ",acl");
1077	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
1078		char *state;
1079		switch (args->ar_quota) {
1080		case GFS2_QUOTA_OFF:
1081			state = "off";
1082			break;
1083		case GFS2_QUOTA_ACCOUNT:
1084			state = "account";
1085			break;
1086		case GFS2_QUOTA_ON:
1087			state = "on";
1088			break;
1089		case GFS2_QUOTA_QUIET:
1090			state = "quiet";
1091			break;
1092		default:
1093			state = "unknown";
1094			break;
1095		}
1096		seq_printf(s, ",quota=%s", state);
1097	}
1098	if (args->ar_suiddir)
1099		seq_puts(s, ",suiddir");
1100	if (args->ar_data != GFS2_DATA_DEFAULT) {
1101		char *state;
1102		switch (args->ar_data) {
1103		case GFS2_DATA_WRITEBACK:
1104			state = "writeback";
1105			break;
1106		case GFS2_DATA_ORDERED:
1107			state = "ordered";
1108			break;
1109		default:
1110			state = "unknown";
1111			break;
1112		}
1113		seq_printf(s, ",data=%s", state);
1114	}
1115	if (args->ar_discard)
1116		seq_puts(s, ",discard");
1117	if (logd_secs != 30)
1118		seq_printf(s, ",commit=%d", logd_secs);
1119	if (statfs_quantum != 30)
1120		seq_printf(s, ",statfs_quantum=%d", statfs_quantum);
1121	else if (statfs_slow)
1122		seq_puts(s, ",statfs_quantum=0");
1123	if (quota_quantum != 60)
1124		seq_printf(s, ",quota_quantum=%d", quota_quantum);
1125	if (args->ar_statfs_percent)
1126		seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
1127	if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
1128		const char *state;
1129
1130		switch (args->ar_errors) {
1131		case GFS2_ERRORS_WITHDRAW:
1132			state = "withdraw";
1133			break;
1134		case GFS2_ERRORS_DEACTIVATE:
1135			state = "deactivate";
1136			break;
1137		case GFS2_ERRORS_PANIC:
1138			state = "panic";
1139			break;
1140		default:
1141			state = "unknown";
1142			break;
1143		}
1144		seq_printf(s, ",errors=%s", state);
1145	}
1146	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1147		seq_puts(s, ",nobarrier");
1148	if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1149		seq_puts(s, ",demote_interface_used");
1150	if (args->ar_rgrplvb)
1151		seq_puts(s, ",rgrplvb");
1152	if (args->ar_loccookie)
1153		seq_puts(s, ",loccookie");
1154	return 0;
1155}
1156
1157/**
1158 * gfs2_glock_put_eventually
1159 * @gl:	The glock to put
1160 *
1161 * When under memory pressure, trigger a deferred glock put to make sure we
1162 * won't call into DLM and deadlock.  Otherwise, put the glock directly.
1163 */
1164
1165static void gfs2_glock_put_eventually(struct gfs2_glock *gl)
1166{
1167	if (current->flags & PF_MEMALLOC)
1168		gfs2_glock_put_async(gl);
1169	else
1170		gfs2_glock_put(gl);
1171}
1172
1173static enum evict_behavior gfs2_upgrade_iopen_glock(struct inode *inode)
1174{
1175	struct gfs2_inode *ip = GFS2_I(inode);
1176	struct gfs2_sbd *sdp = GFS2_SB(inode);
1177	struct gfs2_holder *gh = &ip->i_iopen_gh;
1178	int error;
1179
1180	gh->gh_flags |= GL_NOCACHE;
1181	gfs2_glock_dq_wait(gh);
1182
1183	/*
1184	 * If there are no other lock holders, we will immediately get
1185	 * exclusive access to the iopen glock here.
1186	 *
1187	 * Otherwise, the other nodes holding the lock will be notified about
1188	 * our locking request (see iopen_go_callback()).  If they do not have
1189	 * the inode open, they are expected to evict the cached inode and
1190	 * release the lock, allowing us to proceed.
1191	 *
1192	 * Otherwise, if they cannot evict the inode, they are expected to poke
1193	 * the inode glock (note: not the iopen glock).  We will notice that
1194	 * and stop waiting for the iopen glock immediately.  The other node(s)
1195	 * are then expected to take care of deleting the inode when they no
1196	 * longer use it.
1197	 *
1198	 * As a last resort, if another node keeps holding the iopen glock
1199	 * without showing any activity on the inode glock, we will eventually
1200	 * time out and fail the iopen glock upgrade.
1201	 */
1202
1203	gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh);
1204	error = gfs2_glock_nq(gh);
1205	if (error)
1206		return EVICT_SHOULD_SKIP_DELETE;
1207
1208	wait_event_interruptible_timeout(sdp->sd_async_glock_wait,
1209		!test_bit(HIF_WAIT, &gh->gh_iflags) ||
1210		glock_needs_demote(ip->i_gl),
1211		5 * HZ);
1212	if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) {
1213		gfs2_glock_dq(gh);
1214		if (glock_needs_demote(ip->i_gl))
1215			return EVICT_SHOULD_SKIP_DELETE;
1216		return EVICT_SHOULD_DEFER_DELETE;
1217	}
1218	error = gfs2_glock_holder_ready(gh);
1219	if (error)
1220		return EVICT_SHOULD_SKIP_DELETE;
1221	return EVICT_SHOULD_DELETE;
1222}
1223
1224/**
1225 * evict_should_delete - determine whether the inode is eligible for deletion
1226 * @inode: The inode to evict
1227 * @gh: The glock holder structure
1228 *
1229 * This function determines whether the evicted inode is eligible to be deleted
1230 * and locks the inode glock.
1231 *
1232 * Returns: the fate of the dinode
1233 */
1234static enum evict_behavior evict_should_delete(struct inode *inode,
1235					       struct gfs2_holder *gh)
1236{
1237	struct gfs2_inode *ip = GFS2_I(inode);
1238	struct super_block *sb = inode->i_sb;
1239	struct gfs2_sbd *sdp = sb->s_fs_info;
1240	int ret;
1241
1242	if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
1243	    test_bit(GLF_DEFER_DELETE, &ip->i_iopen_gh.gh_gl->gl_flags))
1244		return EVICT_SHOULD_DEFER_DELETE;
1245
1246	/* Deletes should never happen under memory pressure anymore.  */
1247	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
1248		return EVICT_SHOULD_DEFER_DELETE;
1249
1250	/* Must not read inode block until block type has been verified */
1251	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh);
1252	if (unlikely(ret))
1253		return EVICT_SHOULD_SKIP_DELETE;
1254
1255	if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino))
1256		return EVICT_SHOULD_SKIP_DELETE;
1257	ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
1258	if (ret)
1259		return EVICT_SHOULD_SKIP_DELETE;
1260
1261	ret = gfs2_instantiate(gh);
1262	if (ret)
1263		return EVICT_SHOULD_SKIP_DELETE;
1264
1265	/*
1266	 * The inode may have been recreated in the meantime.
1267	 */
1268	if (inode->i_nlink)
1269		return EVICT_SHOULD_SKIP_DELETE;
1270
1271	if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
1272	    test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
1273		return gfs2_upgrade_iopen_glock(inode);
1274	return EVICT_SHOULD_DELETE;
1275}
1276
1277/**
1278 * evict_unlinked_inode - delete the pieces of an unlinked evicted inode
1279 * @inode: The inode to evict
1280 */
1281static int evict_unlinked_inode(struct inode *inode)
1282{
1283	struct gfs2_inode *ip = GFS2_I(inode);
1284	int ret;
1285
1286	if (S_ISDIR(inode->i_mode) &&
1287	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
1288		ret = gfs2_dir_exhash_dealloc(ip);
1289		if (ret)
1290			goto out;
1291	}
1292
1293	if (ip->i_eattr) {
1294		ret = gfs2_ea_dealloc(ip, true);
1295		if (ret)
1296			goto out;
1297	}
1298
1299	if (!gfs2_is_stuffed(ip)) {
1300		ret = gfs2_file_dealloc(ip);
1301		if (ret)
1302			goto out;
1303	}
1304
1305	/*
1306	 * As soon as we clear the bitmap for the dinode, gfs2_create_inode()
1307	 * can get called to recreate it, or even gfs2_inode_lookup() if the
1308	 * inode was recreated on another node in the meantime.
1309	 *
1310	 * However, inserting the new inode into the inode hash table will not
1311	 * succeed until the old inode is removed, and that only happens after
1312	 * ->evict_inode() returns.  The new inode is attached to its inode and
1313	 *  iopen glocks after inserting it into the inode hash table, so at
1314	 *  that point we can be sure that both glocks are unused.
1315	 */
1316
1317	ret = gfs2_dinode_dealloc(ip);
1318	if (!ret && ip->i_gl)
1319		gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino);
1320
1321out:
1322	return ret;
1323}
1324
1325/*
1326 * evict_linked_inode - evict an inode whose dinode has not been unlinked
1327 * @inode: The inode to evict
1328 */
1329static int evict_linked_inode(struct inode *inode)
1330{
1331	struct super_block *sb = inode->i_sb;
1332	struct gfs2_sbd *sdp = sb->s_fs_info;
1333	struct gfs2_inode *ip = GFS2_I(inode);
1334	struct address_space *metamapping;
1335	int ret;
1336
1337	gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
1338		       GFS2_LFC_EVICT_INODE);
1339	metamapping = gfs2_glock2aspace(ip->i_gl);
1340	if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
1341		filemap_fdatawrite(metamapping);
1342		filemap_fdatawait(metamapping);
1343	}
1344	write_inode_now(inode, 1);
1345	gfs2_ail_flush(ip->i_gl, 0);
1346
1347	ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1348	if (ret)
1349		return ret;
1350
1351	/* Needs to be done before glock release & also in a transaction */
1352	truncate_inode_pages(&inode->i_data, 0);
1353	truncate_inode_pages(metamapping, 0);
1354	gfs2_trans_end(sdp);
1355	return 0;
1356}
1357
/**
 * gfs2_evict_inode - Remove an inode from cache
 * @inode: The inode to evict
 *
 * There are three cases to consider:
 * 1. i_nlink == 0, we are final opener (and must deallocate)
 * 2. i_nlink == 0, we are not the final opener (and cannot deallocate)
 * 3. i_nlink > 0
 *
 * If the fs is read only, then we have to treat all cases as per #3
 * since we are unable to do any deallocation. The inode will be
 * deallocated by the next read/write node to attempt an allocation
 * in the same resource group.
 *
 * We have to (at the moment) hold the inode's main lock to cover
 * the gap between unlocking the shared lock on the iopen lock and
 * taking the exclusive lock. I'd rather do a shared -> exclusive
 * conversion on the iopen lock, but we can change that later. This
 * is safe, just less efficient.
 */

static void gfs2_evict_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct gfs2_sbd *sdp = sb->s_fs_info;
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_holder gh;
	enum evict_behavior behavior;
	int ret;

	/*
	 * Mark the holder uninitialized up front so that the common teardown
	 * at "out" can tell whether the inode glock was ever taken.
	 */
	gfs2_holder_mark_uninitialized(&gh);
	/*
	 * Still linked, read-only filesystem, or no disk address: go straight
	 * to the common teardown.
	 */
	if (inode->i_nlink || sb_rdonly(sb) || !ip->i_no_addr)
		goto out;

	/*
	 * In case of an incomplete mount, gfs2_evict_inode() may be called for
	 * system files without having an active journal to write to.  In that
	 * case, skip the filesystem evict.
	 */
	if (!sdp->sd_jdesc)
		goto out;

	/* May leave the inode glock held through gh; it is released at "out". */
	behavior = evict_should_delete(inode, &gh);
	if (behavior == EVICT_SHOULD_DEFER_DELETE &&
	    !test_bit(SDF_KILL, &sdp->sd_flags)) {
		struct gfs2_glock *io_gl = ip->i_iopen_gh.gh_gl;

		if (io_gl) {
			/*
			 * Take a reference for the queued work; drop it again
			 * when gfs2_queue_verify_delete() returns false
			 * (presumably meaning nothing was queued -- confirm).
			 */
			gfs2_glock_hold(io_gl);
			if (!gfs2_queue_verify_delete(io_gl, true))
				gfs2_glock_put(io_gl);
			goto out;
		}
		/* No iopen glock to defer on: fall back to not deleting. */
		behavior = EVICT_SHOULD_SKIP_DELETE;
	}
	if (behavior == EVICT_SHOULD_DELETE)
		ret = evict_unlinked_inode(inode);
	else
		ret = evict_linked_inode(inode);

	if (gfs2_rs_active(&ip->i_res))
		gfs2_rs_deltree(&ip->i_res);

	/* GLR_TRYFAILED and -EROFS are not reported. */
	if (ret && ret != GLR_TRYFAILED && ret != -EROFS)
		fs_warn(sdp, "gfs2_evict_inode: %d\n", ret);
out:
	/* Common teardown, reached on every path. */
	if (gfs2_holder_initialized(&gh))
		gfs2_glock_dq_uninit(&gh);
	truncate_inode_pages_final(&inode->i_data);
	if (ip->i_qadata)
		gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
	gfs2_rs_deltree(&ip->i_res);
	gfs2_ordered_del_inode(ip);
	clear_inode(inode);
	gfs2_dir_hash_inval(ip);
	if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;

		glock_clear_object(gl, ip);
		/*
		 * Hold an extra reference across gfs2_glock_dq_uninit() and
		 * drop it afterwards via gfs2_glock_put_eventually().
		 */
		gfs2_glock_hold(gl);
		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
		gfs2_glock_put_eventually(gl);
	}
	if (ip->i_gl) {
		glock_clear_object(ip->i_gl, ip);
		/* Wait on GIF_GLOP_PENDING before putting the inode glock. */
		wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
		gfs2_glock_put_eventually(ip->i_gl);
		rcu_assign_pointer(ip->i_gl, NULL);
	}
}
1449
1450static struct inode *gfs2_alloc_inode(struct super_block *sb)
1451{
1452	struct gfs2_inode *ip;
1453
1454	ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
1455	if (!ip)
1456		return NULL;
1457	ip->i_no_addr = 0;
1458	ip->i_no_formal_ino = 0;
1459	ip->i_flags = 0;
1460	ip->i_gl = NULL;
1461	gfs2_holder_mark_uninitialized(&ip->i_iopen_gh);
1462	memset(&ip->i_res, 0, sizeof(ip->i_res));
1463	RB_CLEAR_NODE(&ip->i_res.rs_node);
1464	ip->i_diskflags = 0;
1465	ip->i_rahead = 0;
1466	return &ip->i_inode;
1467}
1468
1469static void gfs2_free_inode(struct inode *inode)
1470{
1471	kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode));
1472}
1473
1474void free_local_statfs_inodes(struct gfs2_sbd *sdp)
1475{
1476	struct local_statfs_inode *lsi, *safe;
1477
1478	/* Run through the statfs inodes list to iput and free memory */
1479	list_for_each_entry_safe(lsi, safe, &sdp->sd_sc_inodes_list, si_list) {
1480		if (lsi->si_jid == sdp->sd_jdesc->jd_jid)
1481			sdp->sd_sc_inode = NULL; /* belongs to this node */
1482		if (lsi->si_sc_inode)
1483			iput(lsi->si_sc_inode);
1484		list_del(&lsi->si_list);
1485		kfree(lsi);
1486	}
1487}
1488
1489struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
1490				      unsigned int index)
1491{
1492	struct local_statfs_inode *lsi;
1493
1494	/* Return the local (per node) statfs inode in the
1495	 * sdp->sd_sc_inodes_list corresponding to the 'index'. */
1496	list_for_each_entry(lsi, &sdp->sd_sc_inodes_list, si_list) {
1497		if (lsi->si_jid == index)
1498			return lsi->si_sc_inode;
1499	}
1500	return NULL;
1501}
1502
1503const struct super_operations gfs2_super_ops = {
1504	.alloc_inode		= gfs2_alloc_inode,
1505	.free_inode		= gfs2_free_inode,
1506	.write_inode		= gfs2_write_inode,
1507	.dirty_inode		= gfs2_dirty_inode,
1508	.evict_inode		= gfs2_evict_inode,
1509	.put_super		= gfs2_put_super,
1510	.sync_fs		= gfs2_sync_fs,
1511	.freeze_super		= gfs2_freeze_super,
1512	.freeze_fs		= gfs2_freeze_fs,
1513	.thaw_super		= gfs2_thaw_super,
1514	.statfs			= gfs2_statfs,
1515	.drop_inode		= gfs2_drop_inode,
1516	.show_options		= gfs2_show_options,
1517};
1518