Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at c9a28fa7b9ac19b676deefa0a171ce7df8755c08 (4495 lines, 114 kB)
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_itable.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_utils.h"
#include "xfs_rtalloc.h"
#include "xfs_refcache.h"
#include "xfs_trans_space.h"
#include "xfs_log_priv.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"

int
xfs_open(
	xfs_inode_t	*ip)
{
	int		mode;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return XFS_ERROR(EIO);

	/*
	 * If it's a directory with any blocks, read-ahead block 0
	 * as we're almost certain to have the next operation be a read there.
	 */
	if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
		mode = xfs_ilock_map_shared(ip);
		if (ip->i_d.di_nextents > 0)
			(void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
		xfs_iunlock(ip, mode);
	}
	return 0;
}

/*
 * xfs_getattr
 */
int
xfs_getattr(
	xfs_inode_t	*ip,
	bhv_vattr_t	*vap,
	int		flags)
{
	bhv_vnode_t	*vp = XFS_ITOV(ip);
	xfs_mount_t	*mp = ip->i_mount;

	vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	if (!(flags & ATTR_LAZY))
		xfs_ilock(ip, XFS_ILOCK_SHARED);

	vap->va_size = XFS_ISIZE(ip);
	if (vap->va_mask == XFS_AT_SIZE)
		goto all_done;

	vap->va_nblocks =
		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
	vap->va_nodeid = ip->i_ino;
#if XFS_BIG_INUMS
	vap->va_nodeid += mp->m_inoadd;
#endif
	vap->va_nlink = ip->i_d.di_nlink;

	/*
	 * Quick exit for non-stat callers
	 */
	if ((vap->va_mask &
	    ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
	      XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
		goto all_done;

	/*
	 * Copy from in-core inode.
	 */
	vap->va_mode = ip->i_d.di_mode;
	vap->va_uid = ip->i_d.di_uid;
	vap->va_gid = ip->i_d.di_gid;
	vap->va_projid = ip->i_d.di_projid;

	/*
	 * Check vnode type block/char vs. everything else.
	 */
	switch (ip->i_d.di_mode & S_IFMT) {
	case S_IFBLK:
	case S_IFCHR:
		vap->va_rdev = ip->i_df.if_u2.if_rdev;
		vap->va_blocksize = BLKDEV_IOSIZE;
		break;
	default:
		vap->va_rdev = 0;

		if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
			vap->va_blocksize = xfs_preferred_iosize(mp);
		} else {

			/*
			 * If the file blocks are being allocated from a
			 * realtime partition, then return the inode's
			 * realtime extent size or the realtime volume's
			 * extent size.
			 */
			vap->va_blocksize =
				xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
		}
		break;
	}

	vn_atime_to_timespec(vp, &vap->va_atime);
	vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
	vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
	vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
	vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;

	/*
	 * Exit for stat callers.  See if any of the rest of the fields
	 * to be filled in are needed.
	 */
	if ((vap->va_mask &
	     (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
	      XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
		goto all_done;

	/*
	 * Convert di_flags to xflags.
	 */
	vap->va_xflags = xfs_ip2xflags(ip);

	/*
	 * Exit for inode revalidate.  See if any of the rest of
	 * the fields to be filled in are needed.
	 */
	if ((vap->va_mask &
	     (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
	      XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
		goto all_done;

	vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
	vap->va_nextents =
		(ip->i_df.if_flags & XFS_IFEXTENTS) ?
			ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
			ip->i_d.di_nextents;
	if (ip->i_afp)
		vap->va_anextents =
			(ip->i_afp->if_flags & XFS_IFEXTENTS) ?
				ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
				ip->i_d.di_anextents;
	else
		vap->va_anextents = 0;
	vap->va_gen = ip->i_d.di_gen;

 all_done:
	if (!(flags & ATTR_LAZY))
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return 0;
}


/*
 * xfs_setattr
 */
int
xfs_setattr(
	xfs_inode_t		*ip,
	bhv_vattr_t		*vap,
	int			flags,
	cred_t			*credp)
{
	bhv_vnode_t		*vp = XFS_ITOV(ip);
	xfs_mount_t		*mp = ip->i_mount;
	xfs_trans_t		*tp;
	int			mask;
	int			code;
	uint			lock_flags;
	uint			commit_flags=0;
	uid_t			uid=0, iuid=0;
	gid_t			gid=0, igid=0;
	int			timeflags = 0;
	xfs_prid_t		projid=0, iprojid=0;
	int			mandlock_before, mandlock_after;
	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
	int			file_owner;
	int			need_iolock = 1;

	vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return XFS_ERROR(EROFS);

	/*
	 * Cannot set certain attributes.
	 */
	mask = vap->va_mask;
	if (mask & XFS_AT_NOSET) {
		return XFS_ERROR(EINVAL);
	}

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * Timestamps do not need to be logged and hence do not
	 * need to be done within a transaction.
	 */
	if (mask & XFS_AT_UPDTIMES) {
		ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
		timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
			    ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
			    ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
		xfs_ichgtime(ip, timeflags);
		return 0;
	}

	olddquot1 = olddquot2 = NULL;
	udqp = gdqp = NULL;

	/*
	 * If disk quotas is on, we make sure that the dquots do exist on disk,
	 * before we start any other transactions. Trying to do this later
	 * is messy. We don't care to take a readlock to look at the ids
	 * in inode here, because we can't hold it across the trans_reserve.
	 * If the IDs do change before we take the ilock, we're covered
	 * because the i_*dquot fields will get updated anyway.
	 */
	if (XFS_IS_QUOTA_ON(mp) &&
	    (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
		uint	qflags = 0;

		if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
			uid = vap->va_uid;
			qflags |= XFS_QMOPT_UQUOTA;
		} else {
			uid = ip->i_d.di_uid;
		}
		if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
			gid = vap->va_gid;
			qflags |= XFS_QMOPT_GQUOTA;
		} else {
			gid = ip->i_d.di_gid;
		}
		if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
			projid = vap->va_projid;
			qflags |= XFS_QMOPT_PQUOTA;
		} else {
			projid = ip->i_d.di_projid;
		}
		/*
		 * We take a reference when we initialize udqp and gdqp,
		 * so it is important that we never blindly double trip on
		 * the same variable. See xfs_create() for an example.
		 */
		ASSERT(udqp == NULL);
		ASSERT(gdqp == NULL);
		code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
					 &udqp, &gdqp);
		if (code)
			return code;
	}

	/*
	 * For the other attributes, we acquire the inode lock and
	 * first do an error checking pass.
	 */
	tp = NULL;
	lock_flags = XFS_ILOCK_EXCL;
	if (flags & ATTR_NOLOCK)
		need_iolock = 0;
	if (!(mask & XFS_AT_SIZE)) {
		if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
		    (mp->m_flags & XFS_MOUNT_WSYNC)) {
			tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
			commit_flags = 0;
			if ((code = xfs_trans_reserve(tp, 0,
						     XFS_ICHANGE_LOG_RES(mp), 0,
						     0, 0))) {
				lock_flags = 0;
				goto error_return;
			}
		}
	} else {
		if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
		    !(flags & ATTR_DMI)) {
			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
				vap->va_size, 0, dmflags, NULL);
			if (code) {
				lock_flags = 0;
				goto error_return;
			}
		}
		if (need_iolock)
			lock_flags |= XFS_IOLOCK_EXCL;
	}

	xfs_ilock(ip, lock_flags);

	/* boolean: are we the file owner? */
	file_owner = (current_fsuid(credp) == ip->i_d.di_uid);

	/*
	 * Change various properties of a file.
	 * Only the owner or users with CAP_FOWNER
	 * capability may do these things.
	 */
	if (mask &
	    (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
	     XFS_AT_GID|XFS_AT_PROJID)) {
		/*
		 * CAP_FOWNER overrides the following restrictions:
		 *
		 * The user ID of the calling process must be equal
		 * to the file owner ID, except in cases where the
		 * CAP_FSETID capability is applicable.
		 */
		if (!file_owner && !capable(CAP_FOWNER)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}

		/*
		 * CAP_FSETID overrides the following restrictions:
		 *
		 * The effective user ID of the calling process shall match
		 * the file owner when setting the set-user-ID and
		 * set-group-ID bits on that file.
		 *
		 * The effective group ID or one of the supplementary group
		 * IDs of the calling process shall match the group owner of
		 * the file when setting the set-group-ID bit on that file
		 */
		if (mask & XFS_AT_MODE) {
			mode_t m = 0;

			if ((vap->va_mode & S_ISUID) && !file_owner)
				m |= S_ISUID;
			if ((vap->va_mode & S_ISGID) &&
			    !in_group_p((gid_t)ip->i_d.di_gid))
				m |= S_ISGID;
#if 0
			/* Linux allows this, Irix doesn't. */
			if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
				m |= S_ISVTX;
#endif
			if (m && !capable(CAP_FSETID))
				vap->va_mode &= ~m;
		}
	}

	/*
	 * Change file ownership.  Must be the owner or privileged.
	 * If the system was configured with the "restricted_chown"
	 * option, the owner is not permitted to give away the file,
	 * and can change the group id only to a group of which he
	 * or she is a member.
	 */
	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
		/*
		 * These IDs could have changed since we last looked at them.
		 * But, we're assured that if the ownership did change
		 * while we didn't have the inode locked, inode's dquot(s)
		 * would have changed also.
		 */
		iuid = ip->i_d.di_uid;
		iprojid = ip->i_d.di_projid;
		igid = ip->i_d.di_gid;
		gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
		uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
		projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
			 iprojid;

		/*
		 * CAP_CHOWN overrides the following restrictions:
		 *
		 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
		 * shall override the restriction that a process cannot
		 * change the user ID of a file it owns and the restriction
		 * that the group ID supplied to the chown() function
		 * shall be equal to either the group ID or one of the
		 * supplementary group IDs of the calling process.
		 */
		if (restricted_chown &&
		    (iuid != uid || (igid != gid &&
				     !in_group_p((gid_t)gid))) &&
		    !capable(CAP_CHOWN)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}
		/*
		 * Do a quota reservation only if uid/projid/gid is actually
		 * going to change.
		 */
		if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
		    (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
		    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
			ASSERT(tp);
			code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
						capable(CAP_FOWNER) ?
						XFS_QMOPT_FORCE_RES : 0);
			if (code)	/* out of quota */
				goto error_return;
		}
	}

	/*
	 * Truncate file.  Must have write permission and not be a directory.
	 */
	if (mask & XFS_AT_SIZE) {
		/* Short circuit the truncate case for zero length files */
		if ((vap->va_size == 0) &&
		    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			lock_flags &= ~XFS_ILOCK_EXCL;
			if (mask & XFS_AT_CTIME)
				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
			code = 0;
			goto error_return;
		}

		if (VN_ISDIR(vp)) {
			code = XFS_ERROR(EISDIR);
			goto error_return;
		} else if (!VN_ISREG(vp)) {
			code = XFS_ERROR(EINVAL);
			goto error_return;
		}
		/*
		 * Make sure that the dquots are attached to the inode.
		 */
		if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
			goto error_return;
	}

	/*
	 * Change file access or modified times.
	 */
	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
		if (!file_owner) {
			if ((flags & ATTR_UTIME) &&
			    !capable(CAP_FOWNER)) {
				code = XFS_ERROR(EPERM);
				goto error_return;
			}
		}
	}

	/*
	 * Change extent size or realtime flag.
	 */
	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
		/*
		 * Can't change extent size if any extents are allocated.
		 */
		if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
		     vap->va_extsize) ) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}

		/*
		 * Can't change realtime flag if any extents are allocated.
		 */
		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
		    (mask & XFS_AT_XFLAGS) &&
		    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
			code = XFS_ERROR(EINVAL);	/* EFBIG? */
			goto error_return;
		}
		/*
		 * Extent size must be a multiple of the appropriate block
		 * size, if set at all.
		 */
		if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
			xfs_extlen_t	size;

			if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
			    ((mask & XFS_AT_XFLAGS) &&
			    (vap->va_xflags & XFS_XFLAG_REALTIME))) {
				size = mp->m_sb.sb_rextsize <<
				       mp->m_sb.sb_blocklog;
			} else {
				size = mp->m_sb.sb_blocksize;
			}
			if (vap->va_extsize % size) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}
		/*
		 * If realtime flag is set then must have realtime data.
		 */
		if ((mask & XFS_AT_XFLAGS) &&
		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
			if ((mp->m_sb.sb_rblocks == 0) ||
			    (mp->m_sb.sb_rextsize == 0) ||
			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
				code = XFS_ERROR(EINVAL);
				goto error_return;
			}
		}

		/*
		 * Can't modify an immutable/append-only file unless
		 * we have appropriate permission.
		 */
		if ((mask & XFS_AT_XFLAGS) &&
		    (ip->i_d.di_flags &
				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
		     (vap->va_xflags &
				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
		    !capable(CAP_LINUX_IMMUTABLE)) {
			code = XFS_ERROR(EPERM);
			goto error_return;
		}
	}

	/*
	 * Now we can make the changes.  Before we join the inode
	 * to the transaction, if XFS_AT_SIZE is set then take care of
	 * the part of the truncation that must be done without the
	 * inode lock.  This needs to be done before joining the inode
	 * to the transaction, because the inode cannot be unlocked
	 * once it is a part of the transaction.
	 */
	if (mask & XFS_AT_SIZE) {
		code = 0;
		if ((vap->va_size > ip->i_size) &&
		    (flags & ATTR_NOSIZETOK) == 0) {
			code = xfs_igrow_start(ip, vap->va_size, credp);
		}
		xfs_iunlock(ip, XFS_ILOCK_EXCL);

		/*
		 * We are going to log the inode size change in this
		 * transaction so any previous writes that are beyond the on
		 * disk EOF and the new EOF that have not been written out need
		 * to be written here. If we do not write the data out, we
		 * expose ourselves to the null files problem.
		 *
		 * Only flush from the on disk size to the smaller of the in
		 * memory file size or the new size as that's the range we
		 * really care about here and prevents waiting for other data
		 * not within the range we care about here.
		 */
		if (!code &&
		    (ip->i_size != ip->i_d.di_size) &&
		    (vap->va_size > ip->i_d.di_size)) {
			code = xfs_flush_pages(ip,
					ip->i_d.di_size, vap->va_size,
					XFS_B_ASYNC, FI_NONE);
		}

		/* wait for all I/O to complete */
		vn_iowait(ip);

		if (!code)
			code = xfs_itruncate_data(ip, vap->va_size);
		if (code) {
			ASSERT(tp == NULL);
			lock_flags &= ~XFS_ILOCK_EXCL;
			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
			goto error_return;
		}
		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
		if ((code = xfs_trans_reserve(tp, 0,
					     XFS_ITRUNCATE_LOG_RES(mp), 0,
					     XFS_TRANS_PERM_LOG_RES,
					     XFS_ITRUNCATE_LOG_COUNT))) {
			xfs_trans_cancel(tp, 0);
			if (need_iolock)
				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return code;
		}
		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
		xfs_ilock(ip, XFS_ILOCK_EXCL);
	}

	if (tp) {
		xfs_trans_ijoin(tp, ip, lock_flags);
		xfs_trans_ihold(tp, ip);
	}

	/* determine whether mandatory locking mode changes */
	mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);

	/*
	 * Truncate file.  Must have write permission and not be a directory.
	 */
	if (mask & XFS_AT_SIZE) {
		if (vap->va_size > ip->i_size) {
			xfs_igrow_finish(tp, ip, vap->va_size,
			    !(flags & ATTR_DMI));
		} else if ((vap->va_size <= ip->i_size) ||
			   ((vap->va_size == 0) && ip->i_d.di_nextents)) {
			/*
			 * signal a sync transaction unless
			 * we're truncating an already unlinked
			 * file on a wsync filesystem
			 */
			code = xfs_itruncate_finish(&tp, ip,
					    (xfs_fsize_t)vap->va_size,
					    XFS_DATA_FORK,
					    ((ip->i_d.di_nlink != 0 ||
					      !(mp->m_flags & XFS_MOUNT_WSYNC))
					     ? 1 : 0));
			if (code)
				goto abort_return;
			/*
			 * Truncated "down", so we're removing references
			 * to old data here - if we now delay flushing for
			 * a long time, we expose ourselves unduly to the
			 * notorious NULL files problem.  So, we mark this
			 * vnode and flush it when the file is closed, and
			 * do not wait the usual (long) time for writeout.
			 */
			xfs_iflags_set(ip, XFS_ITRUNCATED);
		}
		/*
		 * Have to do this even if the file's size doesn't change.
		 */
		timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
	}

	/*
	 * Change file access modes.
	 */
	if (mask & XFS_AT_MODE) {
		ip->i_d.di_mode &= S_IFMT;
		ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;

		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
		timeflags |= XFS_ICHGTIME_CHG;
	}

	/*
	 * Change file ownership.  Must be the owner or privileged.
	 * If the system was configured with the "restricted_chown"
	 * option, the owner is not permitted to give away the file,
	 * and can change the group id only to a group of which he
	 * or she is a member.
	 */
	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
		/*
		 * CAP_FSETID overrides the following restrictions:
		 *
		 * The set-user-ID and set-group-ID bits of a file will be
		 * cleared upon successful return from chown()
		 */
		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
		    !capable(CAP_FSETID)) {
			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
		}

		/*
		 * Change the ownerships and register quota modifications
		 * in the transaction.
		 */
		if (iuid != uid) {
			if (XFS_IS_UQUOTA_ON(mp)) {
				ASSERT(mask & XFS_AT_UID);
				ASSERT(udqp);
				olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_udquot, udqp);
			}
			ip->i_d.di_uid = uid;
		}
		if (igid != gid) {
			if (XFS_IS_GQUOTA_ON(mp)) {
				ASSERT(!XFS_IS_PQUOTA_ON(mp));
				ASSERT(mask & XFS_AT_GID);
				ASSERT(gdqp);
				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_gdquot, gdqp);
			}
			ip->i_d.di_gid = gid;
		}
		if (iprojid != projid) {
			if (XFS_IS_PQUOTA_ON(mp)) {
				ASSERT(!XFS_IS_GQUOTA_ON(mp));
				ASSERT(mask & XFS_AT_PROJID);
				ASSERT(gdqp);
				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
							&ip->i_gdquot, gdqp);
			}
			ip->i_d.di_projid = projid;
			/*
			 * We may have to rev the inode as well as
			 * the superblock version number since projids didn't
			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
			 */
			if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
				xfs_bump_ino_vers2(tp, ip);
		}

		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
		timeflags |= XFS_ICHGTIME_CHG;
	}


	/*
	 * Change file access or modified times.
	 */
	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
		if (mask & XFS_AT_ATIME) {
			ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
			ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
			ip->i_update_core = 1;
			timeflags &= ~XFS_ICHGTIME_ACC;
		}
		if (mask & XFS_AT_MTIME) {
			ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
			ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
			timeflags &= ~XFS_ICHGTIME_MOD;
			timeflags |= XFS_ICHGTIME_CHG;
		}
		if (tp && (flags & ATTR_UTIME))
			xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
	}

	/*
	 * Change XFS-added attributes.
	 */
	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
		if (mask & XFS_AT_EXTSIZE) {
			/*
			 * Converting bytes to fs blocks.
			 */
			ip->i_d.di_extsize = vap->va_extsize >>
				mp->m_sb.sb_blocklog;
		}
		if (mask & XFS_AT_XFLAGS) {
			uint	di_flags;

			/* can't set PREALLOC this way, just preserve it */
			di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
			if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
				di_flags |= XFS_DIFLAG_IMMUTABLE;
			if (vap->va_xflags & XFS_XFLAG_APPEND)
				di_flags |= XFS_DIFLAG_APPEND;
			if (vap->va_xflags & XFS_XFLAG_SYNC)
				di_flags |= XFS_DIFLAG_SYNC;
			if (vap->va_xflags & XFS_XFLAG_NOATIME)
				di_flags |= XFS_DIFLAG_NOATIME;
			if (vap->va_xflags & XFS_XFLAG_NODUMP)
				di_flags |= XFS_DIFLAG_NODUMP;
			if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
				di_flags |= XFS_DIFLAG_NODEFRAG;
			if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
				di_flags |= XFS_DIFLAG_FILESTREAM;
			if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
				if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
					di_flags |= XFS_DIFLAG_NOSYMLINKS;
				if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
			} else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
				if (vap->va_xflags & XFS_XFLAG_REALTIME) {
					di_flags |= XFS_DIFLAG_REALTIME;
					ip->i_iocore.io_flags |= XFS_IOCORE_RT;
				} else {
					ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
				}
				if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
					di_flags |= XFS_DIFLAG_EXTSIZE;
			}
			ip->i_d.di_flags = di_flags;
		}
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		timeflags |= XFS_ICHGTIME_CHG;
	}

	/*
	 * Change file inode change time only if XFS_AT_CTIME set
	 * AND we have been called by a DMI function.
	 */

	if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
		ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
		ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
		ip->i_update_core = 1;
		timeflags &= ~XFS_ICHGTIME_CHG;
	}

	/*
	 * Send out timestamp changes that need to be set to the
	 * current time.  Not done when called by a DMI function.
	 */
	if (timeflags && !(flags & ATTR_DMI))
		xfs_ichgtime(ip, timeflags);

	XFS_STATS_INC(xs_ig_attrchg);

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 * This is slightly sub-optimal in that truncates require
	 * two sync transactions instead of one for wsync filesystems.
	 * One for the truncate and one for the timestamps since we
	 * don't want to change the timestamps unless we're sure the
	 * truncate worked.  Truncates are less than 1% of the laddis
	 * mix so this probably isn't worth the trouble to optimize.
	 */
	code = 0;
	if (tp) {
		if (mp->m_flags & XFS_MOUNT_WSYNC)
			xfs_trans_set_sync(tp);

		code = xfs_trans_commit(tp, commit_flags);
	}

	/*
	 * If the (regular) file's mandatory locking mode changed, then
	 * notify the vnode.  We do this under the inode lock to prevent
	 * racing calls to vop_vnode_change.
	 */
	mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);

	xfs_iunlock(ip, lock_flags);

	/*
	 * Release any dquot(s) the inode had kept before chown.
	 */
	XFS_QM_DQRELE(mp, olddquot1);
	XFS_QM_DQRELE(mp, olddquot2);
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	if (code) {
		return code;
	}

	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
	    !(flags & ATTR_DMI)) {
		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
					NULL, DM_RIGHT_NULL, NULL, NULL,
					0, 0, AT_DELAY_FLAG(flags));
	}
	return 0;

 abort_return:
	commit_flags |= XFS_TRANS_ABORT;
	/* FALLTHROUGH */
 error_return:
	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);
	if (tp) {
		xfs_trans_cancel(tp, commit_flags);
	}
	if (lock_flags != 0) {
		xfs_iunlock(ip, lock_flags);
	}
	return code;
}


/*
 * xfs_access
 * Null conversion from vnode mode bits to inode mode bits, as in efs.
 */
int
xfs_access(
	xfs_inode_t	*ip,
	int		mode,
	cred_t		*credp)
{
	int		error;

	vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	error = xfs_iaccess(ip, mode, credp);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}


/*
 * The maximum pathlen is 1024 bytes. Since the minimum file system
 * blocksize is 512 bytes, we can get a max of 2 extents back from
 * bmapi.
 */
#define SYMLINK_MAPS 2

STATIC int
xfs_readlink_bmap(
	xfs_inode_t	*ip,
	char		*link)
{
	xfs_mount_t	*mp = ip->i_mount;
	int		pathlen = ip->i_d.di_size;
	int		nmaps = SYMLINK_MAPS;
	xfs_bmbt_irec_t	mval[SYMLINK_MAPS];
	xfs_daddr_t	d;
	int		byte_cnt;
	int		n;
	xfs_buf_t	*bp;
	int		error = 0;

	error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
			mval, &nmaps, NULL, NULL);
	if (error)
		goto out;

	for (n = 0; n < nmaps; n++) {
		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);

		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
		error = XFS_BUF_GETERROR(bp);
		if (error) {
			xfs_ioerror_alert("xfs_readlink",
				  ip->i_mount, bp, XFS_BUF_ADDR(bp));
			xfs_buf_relse(bp);
			goto out;
		}
		if (pathlen < byte_cnt)
			byte_cnt = pathlen;
		pathlen -= byte_cnt;

		memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
		xfs_buf_relse(bp);
	}

	link[ip->i_d.di_size] = '\0';
	error = 0;

 out:
	return error;
}

int
xfs_readlink(
	xfs_inode_t	*ip,
	char		*link)
{
	xfs_mount_t	*mp = ip->i_mount;
	int		pathlen;
	int		error = 0;

	vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	xfs_ilock(ip, XFS_ILOCK_SHARED);

	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
	ASSERT(ip->i_d.di_size <= MAXPATHLEN);

	pathlen = ip->i_d.di_size;
	if (!pathlen)
		goto out;

	if (ip->i_df.if_flags & XFS_IFINLINE) {
		memcpy(link, ip->i_df.if_u1.if_data, pathlen);
		link[pathlen] = '\0';
	} else {
		error = xfs_readlink_bmap(ip, link);
	}

 out:
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}

/*
 * xfs_fsync
 *
 * This is called to sync the inode and its data out to disk.
 * We need to hold the I/O lock while flushing the data, and
 * the inode lock while flushing the inode.  The inode lock CANNOT
 * be held while flushing the data, so acquire after we're done
 * with that.
 */
int
xfs_fsync(
	xfs_inode_t	*ip,
	int		flag,
	xfs_off_t	start,
	xfs_off_t	stop)
{
	xfs_trans_t	*tp;
	int		error;
	int		log_flushed = 0, changed = 1;

	vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

	ASSERT(start >= 0 && stop >= -1);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return XFS_ERROR(EIO);

	if (flag & FSYNC_DATA)
		filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);

	/*
	 * We always need to make sure that the required inode state
	 * is safe on disk.  The vnode might be clean but because
	 * of committed transactions that haven't hit the disk yet.
	 * Likewise, there could be unflushed non-transactional
	 * changes to the inode core that have to go to disk.
	 *
	 * The following code depends on one assumption:  that
	 * any transaction that changes an inode logs the core
	 * because it has to change some field in the inode core
	 * (typically nextents or nblocks).  That assumption
	 * implies that any transactions against an inode will
	 * catch any non-transactional updates.  If inode-altering
	 * transactions exist that violate this assumption, the
	 * code breaks.  Right now, it figures that if the involved
	 * update_* field is clear and the inode is unpinned, the
	 * inode is clean.  Either it's been flushed or it's been
	 * committed and the commit has hit the disk unpinning the inode.
	 * (Note that xfs_inode_item_format() called at commit clears
	 * the update_* fields.)
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);

	/* If we are flushing data then we care about update_size
	 * being set, otherwise we care about update_core
	 */
	if ((flag & FSYNC_DATA) ?
			(ip->i_update_size == 0) :
			(ip->i_update_core == 0)) {
		/*
		 * Timestamps/size haven't changed since last inode
		 * flush or inode transaction commit.  That means
		 * either nothing got written or a transaction
		 * committed which caught the updates.  If the
		 * latter happened and the transaction hasn't
		 * hit the disk yet, the inode will be still
		 * be pinned.  If it is, force the log.
		 */

		xfs_iunlock(ip, XFS_ILOCK_SHARED);

		if (xfs_ipincount(ip)) {
			_xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
				      XFS_LOG_FORCE |
				      ((flag & FSYNC_WAIT)
				       ? XFS_LOG_SYNC : 0),
				      &log_flushed);
		} else {
			/*
			 * If the inode is not pinned and nothing
			 * has changed we don't need to flush the
			 * cache.
			 */
			changed = 0;
		}
		error = 0;
	} else	{
		/*
		 * Kick off a transaction to log the inode
		 * core to get the updates.  Make it
		 * sync if FSYNC_WAIT is passed in (which
		 * is done by everybody but specfs).  The
		 * sync transaction will also force the log.
		 */
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
		if ((error = xfs_trans_reserve(tp, 0,
				XFS_FSYNC_TS_LOG_RES(ip->i_mount),
				0, 0, 0))) {
			xfs_trans_cancel(tp, 0);
			return error;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);

		/*
		 * Note - it's possible that we might have pushed
		 * ourselves out of the way during trans_reserve
		 * which would flush the inode.   But there's no
		 * guarantee that the inode buffer has actually
		 * gone out yet (it's delwri).   Plus the buffer
		 * could be pinned anyway if it's part of an
		 * inode in another recent transaction.  So we
		 * play it safe and fire off the transaction anyway.
		 */
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		if (flag & FSYNC_WAIT)
			xfs_trans_set_sync(tp);
		error = _xfs_trans_commit(tp, 0, &log_flushed);

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
		/*
		 * If the log write didn't issue an ordered tag we need
		 * to flush the disk cache for the data device now.
		 */
		if (!log_flushed)
			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);

		/*
		 * If this inode is on the RT dev we need to flush that
		 * cache as well.
		 */
		if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
	}

	return error;
}

/*
 * This is called by xfs_inactive to free any blocks beyond eof
 * when the link count isn't zero and by xfs_dm_punch_hole() when
 * punching a hole to EOF.
 */
int
xfs_free_eofblocks(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip,
	int		flags)
{
	xfs_trans_t	*tp;
	int		error;
	xfs_fileoff_t	end_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_filblks_t	map_len;
	int		nimaps;
	xfs_bmbt_irec_t	imap;
	int		use_iolock = (flags & XFS_FREE_EOF_LOCK);

	/*
	 * Figure out if there are any blocks beyond the end
	 * of the file.  If not, then there is nothing to do.
	 */
	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
	map_len = last_fsb - end_fsb;
	if (map_len <= 0)
		return 0;

	nimaps = 1;
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
			  NULL, 0, &imap, &nimaps, NULL, NULL);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (!error && (nimaps != 0) &&
	    (imap.br_startblock != HOLESTARTBLOCK ||
	     ip->i_delayed_blks)) {
		/*
		 * Attach the dquots to the inode up front.
		 */
		if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
			return error;

		/*
		 * There are blocks after the end of file.
		 * Free them up now by truncating the file to
		 * its current size.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);

		/*
		 * Do the xfs_itruncate_start() call before
		 * reserving any log space because
		 * itruncate_start will call into the buffer
		 * cache and we can't
		 * do that within a transaction.
		 */
		if (use_iolock)
			xfs_ilock(ip, XFS_IOLOCK_EXCL);
		error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
				    ip->i_size);
		if (error) {
			xfs_trans_cancel(tp, 0);
			if (use_iolock)
				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return error;
		}

		error = xfs_trans_reserve(tp, 0,
					  XFS_ITRUNCATE_LOG_RES(mp),
					  0, XFS_TRANS_PERM_LOG_RES,
					  XFS_ITRUNCATE_LOG_COUNT);
		if (error) {
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return error;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip,
				XFS_IOLOCK_EXCL |
				XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		error = xfs_itruncate_finish(&tp, ip,
					     ip->i_size,
					     XFS_DATA_FORK,
					     0);
		/*
		 * If we get an error at this point we
		 * simply don't bother truncating the file.
		 */
		if (error) {
			xfs_trans_cancel(tp,
					 (XFS_TRANS_RELEASE_LOG_RES |
					  XFS_TRANS_ABORT));
		} else {
			error = xfs_trans_commit(tp,
						XFS_TRANS_RELEASE_LOG_RES);
		}
		xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
					    : XFS_ILOCK_EXCL));
	}
	return error;
}

/*
 * Free a symlink that has blocks associated with it.
 */
STATIC int
xfs_inactive_symlink_rmt(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	xfs_buf_t	*bp;
	int		committed;
	int		done;
	int		error;
	xfs_fsblock_t	first_block;
	xfs_bmap_free_t	free_list;
	int		i;
	xfs_mount_t	*mp;
	xfs_bmbt_irec_t	mval[SYMLINK_MAPS];
	int		nmaps;
	xfs_trans_t	*ntp;
	int		size;
	xfs_trans_t	*tp;

	tp = *tpp;
	mp = ip->i_mount;
	ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
	/*
	 * We're freeing a symlink that has some
	 * blocks allocated to it.  Free the
	 * blocks here.  We know that we've got
	 * either 1 or 2 extents and that we can
	 * free them all in one bunmapi call.
	 */
	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		xfs_trans_cancel(tp, 0);
		*tpp = NULL;
		return error;
	}
	/*
	 * Lock the inode, fix the size, and join it to the transaction.
	 * Hold it so in the normal path, we still have it locked for
	 * the second transaction.  In the error paths we need it
	 * held so the cancel won't rele it, see below.
	 */
	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	size = (int)ip->i_d.di_size;
	ip->i_d.di_size = 0;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	/*
	 * Find the block(s) so we can inval and unmap them.
	 */
	done = 0;
	XFS_BMAP_INIT(&free_list, &first_block);
	nmaps = ARRAY_SIZE(mval);
	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
			&free_list, NULL)))
		goto error0;
	/*
	 * Invalidate the block(s).
	 */
	for (i = 0; i < nmaps; i++) {
		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
		xfs_trans_binval(tp, bp);
	}
	/*
	 * Unmap the dead block(s) to the free_list.
	 */
	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
			&first_block, &free_list, NULL, &done)))
		goto error1;
	ASSERT(done);
	/*
	 * Commit the first transaction.  This logs the EFI and the inode.
	 */
	if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
		goto error1;
	/*
	 * The transaction must have been committed, since there were
	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
	 * The new tp has the extent freeing and EFDs.
	 */
	ASSERT(committed);
	/*
	 * The first xact was committed, so add the inode to the new one.
	 * Mark it dirty so it will be logged and moved forward in the log as
	 * part of every commit.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	/*
	 * Get a new, empty transaction to return to our caller.
	 */
	ntp = xfs_trans_dup(tp);
	/*
	 * Commit the transaction containing extent freeing and EFDs.
	 * If we get an error on the commit here or on the reserve below,
	 * we need to unlock the inode since the new transaction doesn't
	 * have the inode attached.
	 */
	error = xfs_trans_commit(tp, 0);
	tp = ntp;
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		goto error0;
	}
	/*
	 * Remove the memory for extent descriptions (just bookkeeping).
	 */
	if (ip->i_df.if_bytes)
		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
	ASSERT(ip->i_df.if_bytes == 0);
	/*
	 * Put an itruncate log reservation in the new transaction
	 * for our caller.
	 */
	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		goto error0;
	}
	/*
	 * Return with the inode locked but not joined to the transaction.
	 */
	*tpp = tp;
	return 0;

 error1:
	xfs_bmap_cancel(&free_list);
 error0:
	/*
	 * Have to come here with the inode locked and either
	 * (held and in the transaction) or (not in the transaction).
	 * If the inode isn't held then cancel would iput it, but
	 * that's wrong since this is inactive and the vnode ref
	 * count is 0 already.
	 * Cancel won't do anything to the inode if held, but it still
	 * needs to be locked until the cancel is done, if it was
	 * joined to the transaction.
	 */
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	*tpp = NULL;
	return error;

}

STATIC int
xfs_inactive_symlink_local(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	int		error;

	ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
	/*
	 * We're freeing a symlink which fit into
	 * the inode.  Just free the memory used
	 * to hold the old symlink.
	 */
	error = xfs_trans_reserve(*tpp, 0,
				  XFS_ITRUNCATE_LOG_RES(ip->i_mount),
				  0, XFS_TRANS_PERM_LOG_RES,
				  XFS_ITRUNCATE_LOG_COUNT);

	if (error) {
		xfs_trans_cancel(*tpp, 0);
		*tpp = NULL;
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);

	/*
	 * Zero length symlinks _can_ exist.
	 */
	if (ip->i_df.if_bytes > 0) {
		xfs_idata_realloc(ip,
				  -(ip->i_df.if_bytes),
				  XFS_DATA_FORK);
		ASSERT(ip->i_df.if_bytes == 0);
	}
	return 0;
}

STATIC int
xfs_inactive_attrs(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	xfs_trans_t	*tp;
	int		error;
	xfs_mount_t	*mp;

	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
	tp = *tpp;
	mp = ip->i_mount;
	ASSERT(ip->i_d.di_forkoff != 0);
	xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	error = xfs_attr_inactive(ip);
	if (error) {
		*tpp = NULL;
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		return error; /* goto out */
	}

	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
	error = xfs_trans_reserve(tp, 0,
				  XFS_IFREE_LOG_RES(mp),
				  0, XFS_TRANS_PERM_LOG_RES,
				  XFS_INACTIVE_LOG_COUNT);
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		xfs_trans_cancel(tp, 0);
		*tpp = NULL;
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_idestroy_fork(ip, XFS_ATTR_FORK);

	ASSERT(ip->i_d.di_anextents == 0);

	*tpp = tp;
	return 0;
}

int
xfs_release(
	xfs_inode_t	*ip)
{
	bhv_vnode_t	*vp = XFS_ITOV(ip);
	xfs_mount_t	*mp = ip->i_mount;
	int		error;

	if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
		return 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return 0;

	if (!XFS_FORCED_SHUTDOWN(mp)) {
		int truncated;

		/*
		 * If we are using filestreams, and we have an unlinked
		 * file that we are processing the last close on, then nothing
		 * will be able to reopen and write to this file. Purge this
		 * inode from the filestreams cache so that it doesn't delay
		 * teardown of the inode.
		 */
		if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
			xfs_filestream_deassociate(ip);

		/*
		 * If we previously truncated this file and removed old data
		 * in the process, we want to initiate "early" writeout on
		 * the last close.  This is an attempt to combat the notorious
		 * NULL files problem which is particularly noticable from a
		 * truncate down, buffered (re-)write (delalloc), followed by
		 * a crash.  What we are effectively doing here is
		 * significantly reducing the time window where we'd otherwise
		 * be exposed to that problem.
		 */
		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
		if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
			xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
	}

#ifdef HAVE_REFCACHE
	/* If we are in the NFS reference cache then don't do this now */
	if (ip->i_refcache)
		return 0;
#endif

	if (ip->i_d.di_nlink != 0) {
		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
		     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
		       ip->i_delayed_blks > 0)) &&
		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
		    (!(ip->i_d.di_flags &
				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
			error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
			if (error)
				return error;
			/* Update linux inode block count after free above */
			vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
				ip->i_d.di_nblocks + ip->i_delayed_blks);
		}
	}

	return 0;
}

/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
int
xfs_inactive(
	xfs_inode_t	*ip)
{
	bhv_vnode_t	*vp = XFS_ITOV(ip);
	xfs_bmap_free_t	free_list;
	xfs_fsblock_t	first_block;
	int		committed;
	xfs_trans_t	*tp;
	xfs_mount_t	*mp;
	int		error;
	int		truncate;

	vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
		ASSERT(ip->i_df.if_real_bytes == 0);
		ASSERT(ip->i_df.if_broot_bytes == 0);
		return VN_INACTIVE_CACHE;
	}

	/*
	 * Only do a truncate if it's a regular file with
	 * some actual space in it.  It's OK to look at the
	 * inode's fields without the lock because we're the
	 * only one with a reference to the inode.
	 */
	truncate = ((ip->i_d.di_nlink == 0) &&
	    ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
	     (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
	    ((ip->i_d.di_mode & S_IFMT) == S_IFREG));

	mp = ip->i_mount;

	if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
		(void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
	}

	error = 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (mp->m_flags & XFS_MOUNT_RDONLY)
		goto out;

	if (ip->i_d.di_nlink != 0) {
		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
		     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
		       ip->i_delayed_blks > 0)) &&
		      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
		     (!(ip->i_d.di_flags &
				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
		      (ip->i_delayed_blks != 0)))) {
			error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
			if (error)
				return VN_INACTIVE_CACHE;
			/* Update linux inode block count after free above */
			vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
				ip->i_d.di_nblocks + ip->i_delayed_blks);
		}
		goto out;
	}

	ASSERT(ip->i_d.di_nlink == 0);

	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
		return VN_INACTIVE_CACHE;

	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
	if (truncate) {
		/*
		 * Do the xfs_itruncate_start() call before
		 * reserving any log space because itruncate_start
		 * will call into the buffer cache and we can't
		 * do that within a transaction.
		 */
		xfs_ilock(ip, XFS_IOLOCK_EXCL);

		error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
		if (error) {
			xfs_trans_cancel(tp, 0);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return VN_INACTIVE_CACHE;
		}

		error = xfs_trans_reserve(tp, 0,
					  XFS_ITRUNCATE_LOG_RES(mp),
					  0, XFS_TRANS_PERM_LOG_RES,
					  XFS_ITRUNCATE_LOG_COUNT);
		if (error) {
			/* Don't call itruncate_cleanup */
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return VN_INACTIVE_CACHE;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * normally, we have to run xfs_itruncate_finish sync.
		 * But if filesystem is wsync and we're in the inactive
		 * path, then we know that nlink == 0, and that the
		 * xaction that made nlink == 0 is permanently committed
		 * since xfs_remove runs as a synchronous transaction.
		 */
		error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
				(!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));

		if (error) {
			xfs_trans_cancel(tp,
				XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
			return VN_INACTIVE_CACHE;
		}
	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {

		/*
		 * If we get an error while cleaning up a
		 * symlink we bail out.
		 */
		error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
			xfs_inactive_symlink_rmt(ip, &tp) :
			xfs_inactive_symlink_local(ip, &tp);

		if (error) {
			ASSERT(tp == NULL);
			return VN_INACTIVE_CACHE;
		}

		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
	} else {
		error = xfs_trans_reserve(tp, 0,
					  XFS_IFREE_LOG_RES(mp),
					  0, XFS_TRANS_PERM_LOG_RES,
					  XFS_INACTIVE_LOG_COUNT);
		if (error) {
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			return VN_INACTIVE_CACHE;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
	}

	/*
	 * If there are attributes associated with the file
	 * then blow them away now.  The code calls a routine
	 * that recursively deconstructs the attribute fork.
	 * We need to just commit the current transaction
	 * because we can't use it for xfs_attr_inactive().
	 */
	if (ip->i_d.di_anextents > 0) {
		error = xfs_inactive_attrs(ip, &tp);
		/*
		 * If we got an error, the transaction is already
		 * cancelled, and the inode is unlocked. Just get out.
		 */
		 if (error)
			 return VN_INACTIVE_CACHE;
	} else if (ip->i_afp) {
		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
	}

	/*
	 * Free the inode.
	 */
	XFS_BMAP_INIT(&free_list, &first_block);
	error = xfs_ifree(tp, ip, &free_list);
	if (error) {
		/*
		 * If we fail to free the inode, shut down.  The cancel
		 * might do that, we need to make sure.  Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!XFS_FORCED_SHUTDOWN(mp)) {
			cmn_err(CE_NOTE,
		"xfs_inactive: xfs_ifree() returned an error = %d on %s",
				error, mp->m_fsname);
			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		}
		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
	} else {
		/*
		 * Credit the quota account(s). The inode is gone.
		 */
		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

		/*
		 * Just ignore errors at this point.  There is
		 * nothing we can do except to try to keep going.
		 */
		(void) xfs_bmap_finish(&tp,  &free_list, &committed);
		(void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	}
	/*
	 * Release the dquots held by inode, if any.
	 */
	XFS_QM_DQDETACH(mp, ip);

	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);

 out:
	return VN_INACTIVE_CACHE;
}


int
xfs_lookup(
	xfs_inode_t		*dp,
	bhv_vname_t		*dentry,
	bhv_vnode_t		**vpp)
{
	xfs_inode_t		*ip;
	xfs_ino_t		e_inum;
	int			error;
	uint			lock_mode;

	vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address);

	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
		return XFS_ERROR(EIO);

	lock_mode = xfs_ilock_map_shared(dp);
	error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
	if (!error) {
		*vpp = XFS_ITOV(ip);
		ITRACE(ip);
	}
	xfs_iunlock_map_shared(dp, lock_mode);
	return error;
}

int
xfs_create(
	xfs_inode_t		*dp,
	bhv_vname_t		*dentry,
	mode_t			mode,
	xfs_dev_t		rdev,
	bhv_vnode_t		**vpp,
	cred_t			*credp)
{
	char			*name = VNAME(dentry);
	xfs_mount_t		*mp = dp->i_mount;
	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
	xfs_inode_t		*ip;
	bhv_vnode_t		*vp = NULL;
	xfs_trans_t		*tp;
	int			error;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	boolean_t		unlock_dp_on_error = B_FALSE;
	int			dm_event_sent = 0;
	uint			cancel_flags;
	int			committed;
	xfs_prid_t		prid;
	struct xfs_dquot	*udqp, *gdqp;
	uint			resblks;
	int			namelen;

	ASSERT(!*vpp);
	vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address);

	namelen = VNAMELEN(dentry);

	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
					dir_vp, DM_RIGHT_NULL, NULL,
					DM_RIGHT_NULL, name, NULL,
					mode, 0, 0);

		if (error)
			return error;
		dm_event_sent = 1;
	}

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/* Return through std_return after this point. */

	udqp = gdqp = NULL;
	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
		prid = dp->i_d.di_projid;
	else
		prid = (xfs_prid_t)dfltprid;

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = XFS_QM_DQVOPALLOC(mp, dp,
			current_fsuid(credp), current_fsgid(credp), prid,
			XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
	if (error)
		goto std_return;

	ip = NULL;

	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	resblks = XFS_CREATE_SPACE_RES(mp, namelen);
	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case.  If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
	}
	if (error) {
		cancel_flags = 0;
		goto error_return;
	}

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = B_TRUE;

	XFS_BMAP_INIT(&free_list, &first_block);

	ASSERT(ip == NULL);

	/*
	 * Reserve disk quota and the inode.
	 */
	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
	if (error)
		goto error_return;

	if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
		goto error_return;
	error = xfs_dir_ialloc(&tp, dp, mode, 1,
			rdev, credp, prid, resblks > 0,
			&ip, &committed);
	if (error) {
		if (error == ENOSPC)
			goto error_return;
		goto abort_return;
	}
	ITRACE(ip);

	/*
	 * At this point, we've gotten a newly allocated inode.
	 * It is locked (and joined to the transaction).
	 */

	ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));

	/*
	 * Now we join the directory inode to the transaction.  We do not do it
	 * earlier because xfs_dir_ialloc might commit the previous transaction
	 * (and release all the locks).  An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	VN_HOLD(dir_vp);
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = B_FALSE;

	error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
					&first_block, &free_list, resblks ?
					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
	if (error) {
		ASSERT(error != ENOSPC);
		goto abort_return;
	}
	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	dp->i_gen++;

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);

	/*
	 * xfs_trans_commit normally decrements the vnode ref count
	 * when it unlocks the inode. Since we want to return the
	 * vnode to the caller, we bump the vnode ref count now.
	 */
	IHOLD(ip);
	vp = XFS_ITOV(ip);

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);
		goto abort_rele;
	}

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	if (error) {
		IRELE(ip);
		tp = NULL;
		goto error_return;
	}

	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	*vpp = vp;

	/* Fallthrough to std_return with error = 0  */

std_return:
	if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
			dir_vp, DM_RIGHT_NULL,
			*vpp ? vp:NULL,
			DM_RIGHT_NULL, name, NULL,
			mode, error, 0);
	}
	return error;

 abort_return:
	cancel_flags |= XFS_TRANS_ABORT;
	/* FALLTHROUGH */

 error_return:
	if (tp != NULL)
		xfs_trans_cancel(tp, cancel_flags);

	XFS_QM_DQRELE(mp, udqp);
	XFS_QM_DQRELE(mp, gdqp);

	if (unlock_dp_on_error)
		xfs_iunlock(dp, XFS_ILOCK_EXCL);

	goto std_return;

 abort_rele:
	/*
	 * Wait until after the current transaction is aborted to
	 * release the inode.  This prevents recursive transactions
	 * and deadlocks from xfs_inactive.
2041 */ 2042 cancel_flags |= XFS_TRANS_ABORT; 2043 xfs_trans_cancel(tp, cancel_flags); 2044 IRELE(ip); 2045 2046 XFS_QM_DQRELE(mp, udqp); 2047 XFS_QM_DQRELE(mp, gdqp); 2048 2049 goto std_return; 2050} 2051 2052#ifdef DEBUG 2053/* 2054 * Some counters to see if (and how often) we are hitting some deadlock 2055 * prevention code paths. 2056 */ 2057 2058int xfs_rm_locks; 2059int xfs_rm_lock_delays; 2060int xfs_rm_attempts; 2061#endif 2062 2063/* 2064 * The following routine will lock the inodes associated with the 2065 * directory and the named entry in the directory. The locks are 2066 * acquired in increasing inode number. 2067 * 2068 * If the entry is "..", then only the directory is locked. The 2069 * vnode ref count will still include that from the .. entry in 2070 * this case. 2071 * 2072 * There is a deadlock we need to worry about. If the locked directory is 2073 * in the AIL, it might be blocking up the log. The next inode we lock 2074 * could be already locked by another thread waiting for log space (e.g. 2075 * a permanent log reservation with a long running transaction (see 2076 * xfs_itruncate_finish)). To solve this, we must check if the directory 2077 * is in the AIL and use lock_nowait. If we can't lock, we need to 2078 * drop the inode lock on the directory and try again. xfs_iunlock will 2079 * potentially push the tail if we were holding up the log. 2080 */ 2081STATIC int 2082xfs_lock_dir_and_entry( 2083 xfs_inode_t *dp, 2084 xfs_inode_t *ip) /* inode of entry 'name' */ 2085{ 2086 int attempts; 2087 xfs_ino_t e_inum; 2088 xfs_inode_t *ips[2]; 2089 xfs_log_item_t *lp; 2090 2091#ifdef DEBUG 2092 xfs_rm_locks++; 2093#endif 2094 attempts = 0; 2095 2096again: 2097 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 2098 2099 e_inum = ip->i_ino; 2100 2101 ITRACE(ip); 2102 2103 /* 2104 * We want to lock in increasing inum. Since we've already 2105 * acquired the lock on the directory, we may need to release 2106 * it if the inum of the entry turns out to be less. 2107 */ 2108 if (e_inum > dp->i_ino) { 2109 /* 2110 * We are already in the right order, so just 2111 * lock on the inode of the entry. 2112 * We need to use nowait if dp is in the AIL. 2113 */ 2114 2115 lp = (xfs_log_item_t *)dp->i_itemp; 2116 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { 2117 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2118 attempts++; 2119#ifdef DEBUG 2120 xfs_rm_attempts++; 2121#endif 2122 2123 /* 2124 * Unlock dp and try again. 2125 * xfs_iunlock will try to push the tail 2126 * if the inode is in the AIL. 2127 */ 2128 2129 xfs_iunlock(dp, XFS_ILOCK_EXCL); 2130 2131 if ((attempts % 5) == 0) { 2132 delay(1); /* Don't just spin the CPU */ 2133#ifdef DEBUG 2134 xfs_rm_lock_delays++; 2135#endif 2136 } 2137 goto again; 2138 } 2139 } else { 2140 xfs_ilock(ip, XFS_ILOCK_EXCL); 2141 } 2142 } else if (e_inum < dp->i_ino) { 2143 xfs_iunlock(dp, XFS_ILOCK_EXCL); 2144 2145 ips[0] = ip; 2146 ips[1] = dp; 2147 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); 2148 } 2149 /* else e_inum == dp->i_ino */ 2150 /* This can happen if we're asked to lock /x/.. 2151 * the entry is "..", which is also the parent directory. 
2152 */ 2153 2154 return 0; 2155} 2156 2157#ifdef DEBUG 2158int xfs_locked_n; 2159int xfs_small_retries; 2160int xfs_middle_retries; 2161int xfs_lots_retries; 2162int xfs_lock_delays; 2163#endif 2164 2165/* 2166 * Bump the subclass so xfs_lock_inodes() acquires each lock with 2167 * a different value 2168 */ 2169static inline int 2170xfs_lock_inumorder(int lock_mode, int subclass) 2171{ 2172 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) 2173 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; 2174 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) 2175 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; 2176 2177 return lock_mode; 2178} 2179 2180/* 2181 * The following routine will lock n inodes in exclusive mode. 2182 * We assume the caller calls us with the inodes in i_ino order. 2183 * 2184 * We need to detect deadlock where an inode that we lock 2185 * is in the AIL and we start waiting for another inode that is locked 2186 * by a thread in a long running transaction (such as truncate). This can 2187 * result in deadlock since the long running trans might need to wait 2188 * for the inode we just locked in order to push the tail and free space 2189 * in the log. 2190 */ 2191void 2192xfs_lock_inodes( 2193 xfs_inode_t **ips, 2194 int inodes, 2195 int first_locked, 2196 uint lock_mode) 2197{ 2198 int attempts = 0, i, j, try_lock; 2199 xfs_log_item_t *lp; 2200 2201 ASSERT(ips && (inodes >= 2)); /* we need at least two */ 2202 2203 if (first_locked) { 2204 try_lock = 1; 2205 i = 1; 2206 } else { 2207 try_lock = 0; 2208 i = 0; 2209 } 2210 2211again: 2212 for (; i < inodes; i++) { 2213 ASSERT(ips[i]); 2214 2215 if (i && (ips[i] == ips[i-1])) /* Already locked */ 2216 continue; 2217 2218 /* 2219 * If try_lock is not set yet, make sure all locked inodes 2220 * are not in the AIL. 2221 * If any are, set try_lock to be used later. 2222 */ 2223 2224 if (!try_lock) { 2225 for (j = (i - 1); j >= 0 && !try_lock; j--) { 2226 lp = (xfs_log_item_t *)ips[j]->i_itemp; 2227 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { 2228 try_lock++; 2229 } 2230 } 2231 } 2232 2233 /* 2234 * If any of the previous locks we have locked is in the AIL, 2235 * we must TRY to get the second and subsequent locks. If 2236 * we can't get any, we must release all we have 2237 * and try again. 2238 */ 2239 2240 if (try_lock) { 2241 /* try_lock must be 0 if i is 0. */ 2242 /* 2243 * try_lock means we have an inode locked 2244 * that is in the AIL. 2245 */ 2246 ASSERT(i != 0); 2247 if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { 2248 attempts++; 2249 2250 /* 2251 * Unlock all previous guys and try again. 2252 * xfs_iunlock will try to push the tail 2253 * if the inode is in the AIL. 2254 */ 2255 2256 for(j = i - 1; j >= 0; j--) { 2257 2258 /* 2259 * Check to see if we've already 2260 * unlocked this one. 2261 * Not the first one going back, 2262 * and the inode ptr is the same. 
2263 */ 2264 if ((j != (i - 1)) && ips[j] == 2265 ips[j+1]) 2266 continue; 2267 2268 xfs_iunlock(ips[j], lock_mode); 2269 } 2270 2271 if ((attempts % 5) == 0) { 2272 delay(1); /* Don't just spin the CPU */ 2273#ifdef DEBUG 2274 xfs_lock_delays++; 2275#endif 2276 } 2277 i = 0; 2278 try_lock = 0; 2279 goto again; 2280 } 2281 } else { 2282 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); 2283 } 2284 } 2285 2286#ifdef DEBUG 2287 if (attempts) { 2288 if (attempts < 5) xfs_small_retries++; 2289 else if (attempts < 100) xfs_middle_retries++; 2290 else xfs_lots_retries++; 2291 } else { 2292 xfs_locked_n++; 2293 } 2294#endif 2295} 2296 2297#ifdef DEBUG 2298#define REMOVE_DEBUG_TRACE(x) {remove_which_error_return = (x);} 2299int remove_which_error_return = 0; 2300#else /* ! DEBUG */ 2301#define REMOVE_DEBUG_TRACE(x) 2302#endif /* ! DEBUG */ 2303 2304int 2305xfs_remove( 2306 xfs_inode_t *dp, 2307 bhv_vname_t *dentry) 2308{ 2309 bhv_vnode_t *dir_vp = XFS_ITOV(dp); 2310 char *name = VNAME(dentry); 2311 xfs_mount_t *mp = dp->i_mount; 2312 xfs_inode_t *ip; 2313 xfs_trans_t *tp = NULL; 2314 int error = 0; 2315 xfs_bmap_free_t free_list; 2316 xfs_fsblock_t first_block; 2317 int cancel_flags; 2318 int committed; 2319 int dm_di_mode = 0; 2320 int link_zero; 2321 uint resblks; 2322 int namelen; 2323 2324 vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address); 2325 2326 if (XFS_FORCED_SHUTDOWN(mp)) 2327 return XFS_ERROR(EIO); 2328 2329 namelen = VNAMELEN(dentry); 2330 2331 if (!xfs_get_dir_entry(dentry, &ip)) { 2332 dm_di_mode = ip->i_d.di_mode; 2333 IRELE(ip); 2334 } 2335 2336 if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { 2337 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, 2338 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, 2339 name, NULL, dm_di_mode, 0, 0); 2340 if (error) 2341 return error; 2342 } 2343 2344 /* From this point on, return through std_return */ 2345 ip = NULL; 2346 2347 /* 2348 * We need to get a reference to ip before we get our log 2349 * reservation. The reason for this is that we cannot call 2350 * xfs_iget for an inode for which we do not have a reference 2351 * once we've acquired a log reservation. This is because the 2352 * inode we are trying to get might be in xfs_inactive going 2353 * for a log reservation. Since we'll have to wait for the 2354 * inactive code to complete before returning from xfs_iget, 2355 * we need to make sure that we don't have log space reserved 2356 * when we call xfs_iget. Instead we get an unlocked reference 2357 * to the inode before getting our log reservation. 2358 */ 2359 error = xfs_get_dir_entry(dentry, &ip); 2360 if (error) { 2361 REMOVE_DEBUG_TRACE(__LINE__); 2362 goto std_return; 2363 } 2364 2365 dm_di_mode = ip->i_d.di_mode; 2366 2367 vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address); 2368 2369 ITRACE(ip); 2370 2371 error = XFS_QM_DQATTACH(mp, dp, 0); 2372 if (!error && dp != ip) 2373 error = XFS_QM_DQATTACH(mp, ip, 0); 2374 if (error) { 2375 REMOVE_DEBUG_TRACE(__LINE__); 2376 IRELE(ip); 2377 goto std_return; 2378 } 2379 2380 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); 2381 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2382 /* 2383 * We try to get the real space reservation first, 2384 * allowing for directory btree deletion(s) implying 2385 * possible bmap insert(s). If we can't get the space 2386 * reservation then we use 0 instead, and avoid the bmap 2387 * btree insert(s) in the directory code by, if the bmap 2388 * insert tries to happen, instead trimming the LAST 2389 * block from the directory. 
2390 */ 2391 resblks = XFS_REMOVE_SPACE_RES(mp); 2392 error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, 2393 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); 2394 if (error == ENOSPC) { 2395 resblks = 0; 2396 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, 2397 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); 2398 } 2399 if (error) { 2400 ASSERT(error != ENOSPC); 2401 REMOVE_DEBUG_TRACE(__LINE__); 2402 xfs_trans_cancel(tp, 0); 2403 IRELE(ip); 2404 return error; 2405 } 2406 2407 error = xfs_lock_dir_and_entry(dp, ip); 2408 if (error) { 2409 REMOVE_DEBUG_TRACE(__LINE__); 2410 xfs_trans_cancel(tp, cancel_flags); 2411 IRELE(ip); 2412 goto std_return; 2413 } 2414 2415 /* 2416 * At this point, we've gotten both the directory and the entry 2417 * inodes locked. 2418 */ 2419 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 2420 if (dp != ip) { 2421 /* 2422 * Increment vnode ref count only in this case since 2423 * there's an extra vnode reference in the case where 2424 * dp == ip. 2425 */ 2426 IHOLD(dp); 2427 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 2428 } 2429 2430 /* 2431 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. 2432 */ 2433 XFS_BMAP_INIT(&free_list, &first_block); 2434 error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino, 2435 &first_block, &free_list, 0); 2436 if (error) { 2437 ASSERT(error != ENOENT); 2438 REMOVE_DEBUG_TRACE(__LINE__); 2439 goto error1; 2440 } 2441 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2442 2443 dp->i_gen++; 2444 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2445 2446 error = xfs_droplink(tp, ip); 2447 if (error) { 2448 REMOVE_DEBUG_TRACE(__LINE__); 2449 goto error1; 2450 } 2451 2452 /* Determine if this is the last link while 2453 * we are in the transaction. 2454 */ 2455 link_zero = (ip)->i_d.di_nlink==0; 2456 2457 /* 2458 * Take an extra ref on the inode so that it doesn't 2459 * go to xfs_inactive() from within the commit. 2460 */ 2461 IHOLD(ip); 2462 2463 /* 2464 * If this is a synchronous mount, make sure that the 2465 * remove transaction goes to disk before returning to 2466 * the user. 2467 */ 2468 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 2469 xfs_trans_set_sync(tp); 2470 } 2471 2472 error = xfs_bmap_finish(&tp, &free_list, &committed); 2473 if (error) { 2474 REMOVE_DEBUG_TRACE(__LINE__); 2475 goto error_rele; 2476 } 2477 2478 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 2479 if (error) { 2480 IRELE(ip); 2481 goto std_return; 2482 } 2483 2484 /* 2485 * Before we drop our extra reference to the inode, purge it 2486 * from the refcache if it is there. By waiting until afterwards 2487 * to do the IRELE, we ensure that we won't go inactive in the 2488 * xfs_refcache_purge_ip routine (although that would be OK). 2489 */ 2490 xfs_refcache_purge_ip(ip); 2491 2492 /* 2493 * If we are using filestreams, kill the stream association. 2494 * If the file is still open it may get a new one but that 2495 * will get killed on last close in xfs_close() so we don't 2496 * have to worry about that. 
2497 */ 2498 if (link_zero && xfs_inode_is_filestream(ip)) 2499 xfs_filestream_deassociate(ip); 2500 2501 vn_trace_exit(ip, __FUNCTION__, (inst_t *)__return_address); 2502 2503 IRELE(ip); 2504 2505/* Fall through to std_return with error = 0 */ 2506 std_return: 2507 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { 2508 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, 2509 dir_vp, DM_RIGHT_NULL, 2510 NULL, DM_RIGHT_NULL, 2511 name, NULL, dm_di_mode, error, 0); 2512 } 2513 return error; 2514 2515 error1: 2516 xfs_bmap_cancel(&free_list); 2517 cancel_flags |= XFS_TRANS_ABORT; 2518 xfs_trans_cancel(tp, cancel_flags); 2519 goto std_return; 2520 2521 error_rele: 2522 /* 2523 * In this case make sure to not release the inode until after 2524 * the current transaction is aborted. Releasing it beforehand 2525 * can cause us to go to xfs_inactive and start a recursive 2526 * transaction which can easily deadlock with the current one. 2527 */ 2528 xfs_bmap_cancel(&free_list); 2529 cancel_flags |= XFS_TRANS_ABORT; 2530 xfs_trans_cancel(tp, cancel_flags); 2531 2532 /* 2533 * Before we drop our extra reference to the inode, purge it 2534 * from the refcache if it is there. By waiting until afterwards 2535 * to do the IRELE, we ensure that we won't go inactive in the 2536 * xfs_refcache_purge_ip routine (although that would be OK). 2537 */ 2538 xfs_refcache_purge_ip(ip); 2539 2540 IRELE(ip); 2541 2542 goto std_return; 2543} 2544 2545int 2546xfs_link( 2547 xfs_inode_t *tdp, 2548 bhv_vnode_t *src_vp, 2549 bhv_vname_t *dentry) 2550{ 2551 bhv_vnode_t *target_dir_vp = XFS_ITOV(tdp); 2552 xfs_mount_t *mp = tdp->i_mount; 2553 xfs_inode_t *sip = xfs_vtoi(src_vp); 2554 xfs_trans_t *tp; 2555 xfs_inode_t *ips[2]; 2556 int error; 2557 xfs_bmap_free_t free_list; 2558 xfs_fsblock_t first_block; 2559 int cancel_flags; 2560 int committed; 2561 int resblks; 2562 char *target_name = VNAME(dentry); 2563 int target_namelen; 2564 2565 vn_trace_entry(tdp, __FUNCTION__, (inst_t *)__return_address); 2566 vn_trace_entry(xfs_vtoi(src_vp), __FUNCTION__, (inst_t *)__return_address); 2567 2568 target_namelen = VNAMELEN(dentry); 2569 ASSERT(!VN_ISDIR(src_vp)); 2570 2571 if (XFS_FORCED_SHUTDOWN(mp)) 2572 return XFS_ERROR(EIO); 2573 2574 if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) { 2575 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, 2576 target_dir_vp, DM_RIGHT_NULL, 2577 src_vp, DM_RIGHT_NULL, 2578 target_name, NULL, 0, 0, 0); 2579 if (error) 2580 return error; 2581 } 2582 2583 /* Return through std_return after this point. */ 2584 2585 error = XFS_QM_DQATTACH(mp, sip, 0); 2586 if (!error && sip != tdp) 2587 error = XFS_QM_DQATTACH(mp, tdp, 0); 2588 if (error) 2589 goto std_return; 2590 2591 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); 2592 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2593 resblks = XFS_LINK_SPACE_RES(mp, target_namelen); 2594 error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, 2595 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); 2596 if (error == ENOSPC) { 2597 resblks = 0; 2598 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0, 2599 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); 2600 } 2601 if (error) { 2602 cancel_flags = 0; 2603 goto error_return; 2604 } 2605 2606 if (sip->i_ino < tdp->i_ino) { 2607 ips[0] = sip; 2608 ips[1] = tdp; 2609 } else { 2610 ips[0] = tdp; 2611 ips[1] = sip; 2612 } 2613 2614 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); 2615 2616 /* 2617 * Increment vnode ref counts since xfs_trans_commit & 2618 * xfs_trans_cancel will both unlock the inodes and 2619 * decrement the associated ref counts. 
2620 */ 2621 VN_HOLD(src_vp); 2622 VN_HOLD(target_dir_vp); 2623 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); 2624 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); 2625 2626 /* 2627 * If the source has too many links, we can't make any more to it. 2628 */ 2629 if (sip->i_d.di_nlink >= XFS_MAXLINK) { 2630 error = XFS_ERROR(EMLINK); 2631 goto error_return; 2632 } 2633 2634 /* 2635 * If we are using project inheritance, we only allow hard link 2636 * creation in our tree when the project IDs are the same; else 2637 * the tree quota mechanism could be circumvented. 2638 */ 2639 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 2640 (tdp->i_d.di_projid != sip->i_d.di_projid))) { 2641 error = XFS_ERROR(EXDEV); 2642 goto error_return; 2643 } 2644 2645 if (resblks == 0 && 2646 (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen))) 2647 goto error_return; 2648 2649 XFS_BMAP_INIT(&free_list, &first_block); 2650 2651 error = xfs_dir_createname(tp, tdp, target_name, target_namelen, 2652 sip->i_ino, &first_block, &free_list, 2653 resblks); 2654 if (error) 2655 goto abort_return; 2656 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2657 tdp->i_gen++; 2658 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); 2659 2660 error = xfs_bumplink(tp, sip); 2661 if (error) 2662 goto abort_return; 2663 2664 /* 2665 * If this is a synchronous mount, make sure that the 2666 * link transaction goes to disk before returning to 2667 * the user. 2668 */ 2669 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 2670 xfs_trans_set_sync(tp); 2671 } 2672 2673 error = xfs_bmap_finish (&tp, &free_list, &committed); 2674 if (error) { 2675 xfs_bmap_cancel(&free_list); 2676 goto abort_return; 2677 } 2678 2679 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 2680 if (error) 2681 goto std_return; 2682 2683 /* Fall through to std_return with error = 0. */ 2684std_return: 2685 if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) { 2686 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, 2687 target_dir_vp, DM_RIGHT_NULL, 2688 src_vp, DM_RIGHT_NULL, 2689 target_name, NULL, 0, error, 0); 2690 } 2691 return error; 2692 2693 abort_return: 2694 cancel_flags |= XFS_TRANS_ABORT; 2695 /* FALLTHROUGH */ 2696 2697 error_return: 2698 xfs_trans_cancel(tp, cancel_flags); 2699 goto std_return; 2700} 2701 2702 2703int 2704xfs_mkdir( 2705 xfs_inode_t *dp, 2706 bhv_vname_t *dentry, 2707 mode_t mode, 2708 bhv_vnode_t **vpp, 2709 cred_t *credp) 2710{ 2711 bhv_vnode_t *dir_vp = XFS_ITOV(dp); 2712 char *dir_name = VNAME(dentry); 2713 int dir_namelen = VNAMELEN(dentry); 2714 xfs_mount_t *mp = dp->i_mount; 2715 xfs_inode_t *cdp; /* inode of created dir */ 2716 bhv_vnode_t *cvp; /* vnode of created dir */ 2717 xfs_trans_t *tp; 2718 int cancel_flags; 2719 int error; 2720 int committed; 2721 xfs_bmap_free_t free_list; 2722 xfs_fsblock_t first_block; 2723 boolean_t unlock_dp_on_error = B_FALSE; 2724 boolean_t created = B_FALSE; 2725 int dm_event_sent = 0; 2726 xfs_prid_t prid; 2727 struct xfs_dquot *udqp, *gdqp; 2728 uint resblks; 2729 2730 if (XFS_FORCED_SHUTDOWN(mp)) 2731 return XFS_ERROR(EIO); 2732 2733 tp = NULL; 2734 2735 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { 2736 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, 2737 dir_vp, DM_RIGHT_NULL, NULL, 2738 DM_RIGHT_NULL, dir_name, NULL, 2739 mode, 0, 0); 2740 if (error) 2741 return error; 2742 dm_event_sent = 1; 2743 } 2744 2745 /* Return through std_return after this point. 
*/ 2746 2747 vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address); 2748 2749 mp = dp->i_mount; 2750 udqp = gdqp = NULL; 2751 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 2752 prid = dp->i_d.di_projid; 2753 else 2754 prid = (xfs_prid_t)dfltprid; 2755 2756 /* 2757 * Make sure that we have allocated dquot(s) on disk. 2758 */ 2759 error = XFS_QM_DQVOPALLOC(mp, dp, 2760 current_fsuid(credp), current_fsgid(credp), prid, 2761 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 2762 if (error) 2763 goto std_return; 2764 2765 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); 2766 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2767 resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen); 2768 error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, 2769 XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); 2770 if (error == ENOSPC) { 2771 resblks = 0; 2772 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0, 2773 XFS_TRANS_PERM_LOG_RES, 2774 XFS_MKDIR_LOG_COUNT); 2775 } 2776 if (error) { 2777 cancel_flags = 0; 2778 goto error_return; 2779 } 2780 2781 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 2782 unlock_dp_on_error = B_TRUE; 2783 2784 /* 2785 * Check for directory link count overflow. 2786 */ 2787 if (dp->i_d.di_nlink >= XFS_MAXLINK) { 2788 error = XFS_ERROR(EMLINK); 2789 goto error_return; 2790 } 2791 2792 /* 2793 * Reserve disk quota and the inode. 2794 */ 2795 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); 2796 if (error) 2797 goto error_return; 2798 2799 if (resblks == 0 && 2800 (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen))) 2801 goto error_return; 2802 /* 2803 * create the directory inode. 2804 */ 2805 error = xfs_dir_ialloc(&tp, dp, mode, 2, 2806 0, credp, prid, resblks > 0, 2807 &cdp, NULL); 2808 if (error) { 2809 if (error == ENOSPC) 2810 goto error_return; 2811 goto abort_return; 2812 } 2813 ITRACE(cdp); 2814 2815 /* 2816 * Now we add the directory inode to the transaction. 2817 * We waited until now since xfs_dir_ialloc might start 2818 * a new transaction. Had we joined the transaction 2819 * earlier, the locks might have gotten released. An error 2820 * from here on will result in the transaction cancel 2821 * unlocking dp so don't do it explicitly in the error path. 2822 */ 2823 VN_HOLD(dir_vp); 2824 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 2825 unlock_dp_on_error = B_FALSE; 2826 2827 XFS_BMAP_INIT(&free_list, &first_block); 2828 2829 error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino, 2830 &first_block, &free_list, resblks ? 2831 resblks - XFS_IALLOC_SPACE_RES(mp) : 0); 2832 if (error) { 2833 ASSERT(error != ENOSPC); 2834 goto error1; 2835 } 2836 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2837 2838 /* 2839 * Bump the in memory version number of the parent directory 2840 * so that other processes accessing it will recognize that 2841 * the directory has changed. 2842 */ 2843 dp->i_gen++; 2844 2845 error = xfs_dir_init(tp, cdp, dp); 2846 if (error) 2847 goto error2; 2848 2849 cdp->i_gen = 1; 2850 error = xfs_bumplink(tp, dp); 2851 if (error) 2852 goto error2; 2853 2854 cvp = XFS_ITOV(cdp); 2855 2856 created = B_TRUE; 2857 2858 *vpp = cvp; 2859 IHOLD(cdp); 2860 2861 /* 2862 * Attach the dquots to the new inode and modify the icount incore. 2863 */ 2864 XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp); 2865 2866 /* 2867 * If this is a synchronous mount, make sure that the 2868 * mkdir transaction goes to disk before returning to 2869 * the user. 
2870 */ 2871 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 2872 xfs_trans_set_sync(tp); 2873 } 2874 2875 error = xfs_bmap_finish(&tp, &free_list, &committed); 2876 if (error) { 2877 IRELE(cdp); 2878 goto error2; 2879 } 2880 2881 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 2882 XFS_QM_DQRELE(mp, udqp); 2883 XFS_QM_DQRELE(mp, gdqp); 2884 if (error) { 2885 IRELE(cdp); 2886 } 2887 2888 /* Fall through to std_return with error = 0 or errno from 2889 * xfs_trans_commit. */ 2890 2891std_return: 2892 if ((created || (error != 0 && dm_event_sent != 0)) && 2893 DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { 2894 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, 2895 dir_vp, DM_RIGHT_NULL, 2896 created ? XFS_ITOV(cdp):NULL, 2897 DM_RIGHT_NULL, 2898 dir_name, NULL, 2899 mode, error, 0); 2900 } 2901 return error; 2902 2903 error2: 2904 error1: 2905 xfs_bmap_cancel(&free_list); 2906 abort_return: 2907 cancel_flags |= XFS_TRANS_ABORT; 2908 error_return: 2909 xfs_trans_cancel(tp, cancel_flags); 2910 XFS_QM_DQRELE(mp, udqp); 2911 XFS_QM_DQRELE(mp, gdqp); 2912 2913 if (unlock_dp_on_error) 2914 xfs_iunlock(dp, XFS_ILOCK_EXCL); 2915 2916 goto std_return; 2917} 2918 2919int 2920xfs_rmdir( 2921 xfs_inode_t *dp, 2922 bhv_vname_t *dentry) 2923{ 2924 bhv_vnode_t *dir_vp = XFS_ITOV(dp); 2925 char *name = VNAME(dentry); 2926 int namelen = VNAMELEN(dentry); 2927 xfs_mount_t *mp = dp->i_mount; 2928 xfs_inode_t *cdp; /* child directory */ 2929 xfs_trans_t *tp; 2930 int error; 2931 xfs_bmap_free_t free_list; 2932 xfs_fsblock_t first_block; 2933 int cancel_flags; 2934 int committed; 2935 int dm_di_mode = S_IFDIR; 2936 int last_cdp_link; 2937 uint resblks; 2938 2939 vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address); 2940 2941 if (XFS_FORCED_SHUTDOWN(mp)) 2942 return XFS_ERROR(EIO); 2943 2944 if (!xfs_get_dir_entry(dentry, &cdp)) { 2945 dm_di_mode = cdp->i_d.di_mode; 2946 IRELE(cdp); 2947 } 2948 2949 if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { 2950 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, 2951 dir_vp, DM_RIGHT_NULL, 2952 NULL, DM_RIGHT_NULL, 2953 name, NULL, dm_di_mode, 0, 0); 2954 if (error) 2955 return XFS_ERROR(error); 2956 } 2957 2958 /* Return through std_return after this point. */ 2959 2960 cdp = NULL; 2961 2962 /* 2963 * We need to get a reference to cdp before we get our log 2964 * reservation. The reason for this is that we cannot call 2965 * xfs_iget for an inode for which we do not have a reference 2966 * once we've acquired a log reservation. This is because the 2967 * inode we are trying to get might be in xfs_inactive going 2968 * for a log reservation. Since we'll have to wait for the 2969 * inactive code to complete before returning from xfs_iget, 2970 * we need to make sure that we don't have log space reserved 2971 * when we call xfs_iget. Instead we get an unlocked reference 2972 * to the inode before getting our log reservation. 2973 */ 2974 error = xfs_get_dir_entry(dentry, &cdp); 2975 if (error) { 2976 REMOVE_DEBUG_TRACE(__LINE__); 2977 goto std_return; 2978 } 2979 mp = dp->i_mount; 2980 dm_di_mode = cdp->i_d.di_mode; 2981 2982 /* 2983 * Get the dquots for the inodes. 
2984 */ 2985 error = XFS_QM_DQATTACH(mp, dp, 0); 2986 if (!error && dp != cdp) 2987 error = XFS_QM_DQATTACH(mp, cdp, 0); 2988 if (error) { 2989 IRELE(cdp); 2990 REMOVE_DEBUG_TRACE(__LINE__); 2991 goto std_return; 2992 } 2993 2994 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); 2995 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2996 /* 2997 * We try to get the real space reservation first, 2998 * allowing for directory btree deletion(s) implying 2999 * possible bmap insert(s). If we can't get the space 3000 * reservation then we use 0 instead, and avoid the bmap 3001 * btree insert(s) in the directory code by, if the bmap 3002 * insert tries to happen, instead trimming the LAST 3003 * block from the directory. 3004 */ 3005 resblks = XFS_REMOVE_SPACE_RES(mp); 3006 error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, 3007 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); 3008 if (error == ENOSPC) { 3009 resblks = 0; 3010 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, 3011 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); 3012 } 3013 if (error) { 3014 ASSERT(error != ENOSPC); 3015 cancel_flags = 0; 3016 IRELE(cdp); 3017 goto error_return; 3018 } 3019 XFS_BMAP_INIT(&free_list, &first_block); 3020 3021 /* 3022 * Now lock the child directory inode and the parent directory 3023 * inode in the proper order. This will take care of validating 3024 * that the directory entry for the child directory inode has 3025 * not changed while we were obtaining a log reservation. 3026 */ 3027 error = xfs_lock_dir_and_entry(dp, cdp); 3028 if (error) { 3029 xfs_trans_cancel(tp, cancel_flags); 3030 IRELE(cdp); 3031 goto std_return; 3032 } 3033 3034 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 3035 if (dp != cdp) { 3036 /* 3037 * Only increment the parent directory vnode count if 3038 * we didn't bump it in looking up cdp. The only time 3039 * we don't bump it is when we're looking up ".". 3040 */ 3041 VN_HOLD(dir_vp); 3042 } 3043 3044 ITRACE(cdp); 3045 xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL); 3046 3047 ASSERT(cdp->i_d.di_nlink >= 2); 3048 if (cdp->i_d.di_nlink != 2) { 3049 error = XFS_ERROR(ENOTEMPTY); 3050 goto error_return; 3051 } 3052 if (!xfs_dir_isempty(cdp)) { 3053 error = XFS_ERROR(ENOTEMPTY); 3054 goto error_return; 3055 } 3056 3057 error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino, 3058 &first_block, &free_list, resblks); 3059 if (error) 3060 goto error1; 3061 3062 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3063 3064 /* 3065 * Bump the in memory generation count on the parent 3066 * directory so that other can know that it has changed. 3067 */ 3068 dp->i_gen++; 3069 3070 /* 3071 * Drop the link from cdp's "..". 3072 */ 3073 error = xfs_droplink(tp, dp); 3074 if (error) { 3075 goto error1; 3076 } 3077 3078 /* 3079 * Drop the link from dp to cdp. 3080 */ 3081 error = xfs_droplink(tp, cdp); 3082 if (error) { 3083 goto error1; 3084 } 3085 3086 /* 3087 * Drop the "." link from cdp to self. 3088 */ 3089 error = xfs_droplink(tp, cdp); 3090 if (error) { 3091 goto error1; 3092 } 3093 3094 /* Determine these before committing transaction */ 3095 last_cdp_link = (cdp)->i_d.di_nlink==0; 3096 3097 /* 3098 * Take an extra ref on the child vnode so that it 3099 * does not go to xfs_inactive() from within the commit. 3100 */ 3101 IHOLD(cdp); 3102 3103 /* 3104 * If this is a synchronous mount, make sure that the 3105 * rmdir transaction goes to disk before returning to 3106 * the user. 
3107 */ 3108 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 3109 xfs_trans_set_sync(tp); 3110 } 3111 3112 error = xfs_bmap_finish (&tp, &free_list, &committed); 3113 if (error) { 3114 xfs_bmap_cancel(&free_list); 3115 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | 3116 XFS_TRANS_ABORT)); 3117 IRELE(cdp); 3118 goto std_return; 3119 } 3120 3121 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 3122 if (error) { 3123 IRELE(cdp); 3124 goto std_return; 3125 } 3126 3127 3128 IRELE(cdp); 3129 3130 /* Fall through to std_return with error = 0 or the errno 3131 * from xfs_trans_commit. */ 3132 std_return: 3133 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { 3134 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, 3135 dir_vp, DM_RIGHT_NULL, 3136 NULL, DM_RIGHT_NULL, 3137 name, NULL, dm_di_mode, 3138 error, 0); 3139 } 3140 return error; 3141 3142 error1: 3143 xfs_bmap_cancel(&free_list); 3144 cancel_flags |= XFS_TRANS_ABORT; 3145 /* FALLTHROUGH */ 3146 3147 error_return: 3148 xfs_trans_cancel(tp, cancel_flags); 3149 goto std_return; 3150} 3151 3152int 3153xfs_symlink( 3154 xfs_inode_t *dp, 3155 bhv_vname_t *dentry, 3156 char *target_path, 3157 mode_t mode, 3158 bhv_vnode_t **vpp, 3159 cred_t *credp) 3160{ 3161 bhv_vnode_t *dir_vp = XFS_ITOV(dp); 3162 xfs_mount_t *mp = dp->i_mount; 3163 xfs_trans_t *tp; 3164 xfs_inode_t *ip; 3165 int error; 3166 int pathlen; 3167 xfs_bmap_free_t free_list; 3168 xfs_fsblock_t first_block; 3169 boolean_t unlock_dp_on_error = B_FALSE; 3170 uint cancel_flags; 3171 int committed; 3172 xfs_fileoff_t first_fsb; 3173 xfs_filblks_t fs_blocks; 3174 int nmaps; 3175 xfs_bmbt_irec_t mval[SYMLINK_MAPS]; 3176 xfs_daddr_t d; 3177 char *cur_chunk; 3178 int byte_cnt; 3179 int n; 3180 xfs_buf_t *bp; 3181 xfs_prid_t prid; 3182 struct xfs_dquot *udqp, *gdqp; 3183 uint resblks; 3184 char *link_name = VNAME(dentry); 3185 int link_namelen; 3186 3187 *vpp = NULL; 3188 error = 0; 3189 ip = NULL; 3190 tp = NULL; 3191 3192 vn_trace_entry(dp, __FUNCTION__, (inst_t *)__return_address); 3193 3194 3195 if (XFS_FORCED_SHUTDOWN(mp)) 3196 return XFS_ERROR(EIO); 3197 3198 link_namelen = VNAMELEN(dentry); 3199 3200 /* 3201 * Check component lengths of the target path name. 3202 */ 3203 pathlen = strlen(target_path); 3204 if (pathlen >= MAXPATHLEN) /* total string too long */ 3205 return XFS_ERROR(ENAMETOOLONG); 3206 if (pathlen >= MAXNAMELEN) { /* is any component too long? */ 3207 int len, total; 3208 char *path; 3209 3210 for (total = 0, path = target_path; total < pathlen;) { 3211 /* 3212 * Skip any slashes. 3213 */ 3214 while(*path == '/') { 3215 total++; 3216 path++; 3217 } 3218 3219 /* 3220 * Count up to the next slash or end of path. 3221 * Error out if the component is bigger than MAXNAMELEN. 3222 */ 3223 for(len = 0; *path != '/' && total < pathlen;total++, path++) { 3224 if (++len >= MAXNAMELEN) { 3225 error = ENAMETOOLONG; 3226 return error; 3227 } 3228 } 3229 } 3230 } 3231 3232 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { 3233 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp, 3234 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, 3235 link_name, target_path, 0, 0, 0); 3236 if (error) 3237 return error; 3238 } 3239 3240 /* Return through std_return after this point. */ 3241 3242 udqp = gdqp = NULL; 3243 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 3244 prid = dp->i_d.di_projid; 3245 else 3246 prid = (xfs_prid_t)dfltprid; 3247 3248 /* 3249 * Make sure that we have allocated dquot(s) on disk. 
3250 */ 3251 error = XFS_QM_DQVOPALLOC(mp, dp, 3252 current_fsuid(credp), current_fsgid(credp), prid, 3253 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 3254 if (error) 3255 goto std_return; 3256 3257 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); 3258 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 3259 /* 3260 * The symlink will fit into the inode data fork? 3261 * There can't be any attributes so we get the whole variable part. 3262 */ 3263 if (pathlen <= XFS_LITINO(mp)) 3264 fs_blocks = 0; 3265 else 3266 fs_blocks = XFS_B_TO_FSB(mp, pathlen); 3267 resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks); 3268 error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, 3269 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); 3270 if (error == ENOSPC && fs_blocks == 0) { 3271 resblks = 0; 3272 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, 3273 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); 3274 } 3275 if (error) { 3276 cancel_flags = 0; 3277 goto error_return; 3278 } 3279 3280 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 3281 unlock_dp_on_error = B_TRUE; 3282 3283 /* 3284 * Check whether the directory allows new symlinks or not. 3285 */ 3286 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { 3287 error = XFS_ERROR(EPERM); 3288 goto error_return; 3289 } 3290 3291 /* 3292 * Reserve disk quota : blocks and inode. 3293 */ 3294 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); 3295 if (error) 3296 goto error_return; 3297 3298 /* 3299 * Check for ability to enter directory entry, if no space reserved. 3300 */ 3301 if (resblks == 0 && 3302 (error = xfs_dir_canenter(tp, dp, link_name, link_namelen))) 3303 goto error_return; 3304 /* 3305 * Initialize the bmap freelist prior to calling either 3306 * bmapi or the directory create code. 3307 */ 3308 XFS_BMAP_INIT(&free_list, &first_block); 3309 3310 /* 3311 * Allocate an inode for the symlink. 3312 */ 3313 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 3314 1, 0, credp, prid, resblks > 0, &ip, NULL); 3315 if (error) { 3316 if (error == ENOSPC) 3317 goto error_return; 3318 goto error1; 3319 } 3320 ITRACE(ip); 3321 3322 /* 3323 * An error after we've joined dp to the transaction will result in the 3324 * transaction cancel unlocking dp so don't do it explicitly in the 3325 * error path. 3326 */ 3327 VN_HOLD(dir_vp); 3328 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 3329 unlock_dp_on_error = B_FALSE; 3330 3331 /* 3332 * Also attach the dquot(s) to it, if applicable. 3333 */ 3334 XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp); 3335 3336 if (resblks) 3337 resblks -= XFS_IALLOC_SPACE_RES(mp); 3338 /* 3339 * If the symlink will fit into the inode, write it inline. 3340 */ 3341 if (pathlen <= XFS_IFORK_DSIZE(ip)) { 3342 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); 3343 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); 3344 ip->i_d.di_size = pathlen; 3345 3346 /* 3347 * The inode was initially created in extent format. 
3348 */ 3349 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); 3350 ip->i_df.if_flags |= XFS_IFINLINE; 3351 3352 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; 3353 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); 3354 3355 } else { 3356 first_fsb = 0; 3357 nmaps = SYMLINK_MAPS; 3358 3359 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, 3360 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, 3361 &first_block, resblks, mval, &nmaps, 3362 &free_list, NULL); 3363 if (error) { 3364 goto error1; 3365 } 3366 3367 if (resblks) 3368 resblks -= fs_blocks; 3369 ip->i_d.di_size = pathlen; 3370 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 3371 3372 cur_chunk = target_path; 3373 for (n = 0; n < nmaps; n++) { 3374 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 3375 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 3376 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 3377 BTOBB(byte_cnt), 0); 3378 ASSERT(bp && !XFS_BUF_GETERROR(bp)); 3379 if (pathlen < byte_cnt) { 3380 byte_cnt = pathlen; 3381 } 3382 pathlen -= byte_cnt; 3383 3384 memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt); 3385 cur_chunk += byte_cnt; 3386 3387 xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); 3388 } 3389 } 3390 3391 /* 3392 * Create the directory entry for the symlink. 3393 */ 3394 error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino, 3395 &first_block, &free_list, resblks); 3396 if (error) 3397 goto error1; 3398 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3399 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 3400 3401 /* 3402 * Bump the in memory version number of the parent directory 3403 * so that other processes accessing it will recognize that 3404 * the directory has changed. 3405 */ 3406 dp->i_gen++; 3407 3408 /* 3409 * If this is a synchronous mount, make sure that the 3410 * symlink transaction goes to disk before returning to 3411 * the user. 3412 */ 3413 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 3414 xfs_trans_set_sync(tp); 3415 } 3416 3417 /* 3418 * xfs_trans_commit normally decrements the vnode ref count 3419 * when it unlocks the inode. Since we want to return the 3420 * vnode to the caller, we bump the vnode ref count now. 3421 */ 3422 IHOLD(ip); 3423 3424 error = xfs_bmap_finish(&tp, &free_list, &committed); 3425 if (error) { 3426 goto error2; 3427 } 3428 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 3429 XFS_QM_DQRELE(mp, udqp); 3430 XFS_QM_DQRELE(mp, gdqp); 3431 3432 /* Fall through to std_return with error = 0 or errno from 3433 * xfs_trans_commit */ 3434std_return: 3435 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) { 3436 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, 3437 dir_vp, DM_RIGHT_NULL, 3438 error ? 
NULL : XFS_ITOV(ip), 3439 DM_RIGHT_NULL, link_name, target_path, 3440 0, error, 0); 3441 } 3442 3443 if (!error) { 3444 bhv_vnode_t *vp; 3445 3446 ASSERT(ip); 3447 vp = XFS_ITOV(ip); 3448 *vpp = vp; 3449 } 3450 return error; 3451 3452 error2: 3453 IRELE(ip); 3454 error1: 3455 xfs_bmap_cancel(&free_list); 3456 cancel_flags |= XFS_TRANS_ABORT; 3457 error_return: 3458 xfs_trans_cancel(tp, cancel_flags); 3459 XFS_QM_DQRELE(mp, udqp); 3460 XFS_QM_DQRELE(mp, gdqp); 3461 3462 if (unlock_dp_on_error) 3463 xfs_iunlock(dp, XFS_ILOCK_EXCL); 3464 3465 goto std_return; 3466} 3467 3468 3469int 3470xfs_fid2( 3471 xfs_inode_t *ip, 3472 xfs_fid_t *xfid) 3473{ 3474 vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address); 3475 3476 xfid->fid_len = sizeof(xfs_fid_t) - sizeof(xfid->fid_len); 3477 xfid->fid_pad = 0; 3478 /* 3479 * use memcpy because the inode is a long long and there's no 3480 * assurance that xfid->fid_ino is properly aligned. 3481 */ 3482 memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino)); 3483 xfid->fid_gen = ip->i_d.di_gen; 3484 3485 return 0; 3486} 3487 3488 3489int 3490xfs_rwlock( 3491 xfs_inode_t *ip, 3492 bhv_vrwlock_t locktype) 3493{ 3494 if (S_ISDIR(ip->i_d.di_mode)) 3495 return 1; 3496 if (locktype == VRWLOCK_WRITE) { 3497 xfs_ilock(ip, XFS_IOLOCK_EXCL); 3498 } else if (locktype == VRWLOCK_TRY_READ) { 3499 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED); 3500 } else if (locktype == VRWLOCK_TRY_WRITE) { 3501 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL); 3502 } else { 3503 ASSERT((locktype == VRWLOCK_READ) || 3504 (locktype == VRWLOCK_WRITE_DIRECT)); 3505 xfs_ilock(ip, XFS_IOLOCK_SHARED); 3506 } 3507 3508 return 1; 3509} 3510 3511 3512void 3513xfs_rwunlock( 3514 xfs_inode_t *ip, 3515 bhv_vrwlock_t locktype) 3516{ 3517 if (S_ISDIR(ip->i_d.di_mode)) 3518 return; 3519 if (locktype == VRWLOCK_WRITE) { 3520 /* 3521 * In the write case, we may have added a new entry to 3522 * the reference cache. This might store a pointer to 3523 * an inode to be released in this inode. If it is there, 3524 * clear the pointer and release the inode after unlocking 3525 * this one. 3526 */ 3527 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL); 3528 } else { 3529 ASSERT((locktype == VRWLOCK_READ) || 3530 (locktype == VRWLOCK_WRITE_DIRECT)); 3531 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 3532 } 3533 return; 3534} 3535 3536 3537int 3538xfs_inode_flush( 3539 xfs_inode_t *ip, 3540 int flags) 3541{ 3542 xfs_mount_t *mp = ip->i_mount; 3543 xfs_inode_log_item_t *iip = ip->i_itemp; 3544 int error = 0; 3545 3546 if (XFS_FORCED_SHUTDOWN(mp)) 3547 return XFS_ERROR(EIO); 3548 3549 /* 3550 * Bypass inodes which have already been cleaned by 3551 * the inode flush clustering code inside xfs_iflush 3552 */ 3553 if ((ip->i_update_core == 0) && 3554 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) 3555 return 0; 3556 3557 if (flags & FLUSH_LOG) { 3558 if (iip && iip->ili_last_lsn) { 3559 xlog_t *log = mp->m_log; 3560 xfs_lsn_t sync_lsn; 3561 int s, log_flags = XFS_LOG_FORCE; 3562 3563 s = GRANT_LOCK(log); 3564 sync_lsn = log->l_last_sync_lsn; 3565 GRANT_UNLOCK(log, s); 3566 3567 if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) { 3568 if (flags & FLUSH_SYNC) 3569 log_flags |= XFS_LOG_SYNC; 3570 error = xfs_log_force(mp, iip->ili_last_lsn, log_flags); 3571 if (error) 3572 return error; 3573 } 3574 3575 if (ip->i_update_core == 0) 3576 return 0; 3577 } 3578 } 3579 3580 /* 3581 * We make this non-blocking if the inode is contended, 3582 * return EAGAIN to indicate to the caller that they 3583 * did not succeed. 
This prevents the flush path from 3584 * blocking on inodes inside another operation right 3585 * now, they get caught later by xfs_sync. 3586 */ 3587 if (flags & FLUSH_INODE) { 3588 int flush_flags; 3589 3590 if (flags & FLUSH_SYNC) { 3591 xfs_ilock(ip, XFS_ILOCK_SHARED); 3592 xfs_iflock(ip); 3593 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 3594 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { 3595 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3596 return EAGAIN; 3597 } 3598 } else { 3599 return EAGAIN; 3600 } 3601 3602 if (flags & FLUSH_SYNC) 3603 flush_flags = XFS_IFLUSH_SYNC; 3604 else 3605 flush_flags = XFS_IFLUSH_ASYNC; 3606 3607 error = xfs_iflush(ip, flush_flags); 3608 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3609 } 3610 3611 return error; 3612} 3613 3614 3615int 3616xfs_set_dmattrs( 3617 xfs_inode_t *ip, 3618 u_int evmask, 3619 u_int16_t state) 3620{ 3621 xfs_mount_t *mp = ip->i_mount; 3622 xfs_trans_t *tp; 3623 int error; 3624 3625 if (!capable(CAP_SYS_ADMIN)) 3626 return XFS_ERROR(EPERM); 3627 3628 if (XFS_FORCED_SHUTDOWN(mp)) 3629 return XFS_ERROR(EIO); 3630 3631 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); 3632 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0); 3633 if (error) { 3634 xfs_trans_cancel(tp, 0); 3635 return error; 3636 } 3637 xfs_ilock(ip, XFS_ILOCK_EXCL); 3638 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 3639 3640 ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask; 3641 ip->i_iocore.io_dmstate = ip->i_d.di_dmstate = state; 3642 3643 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 3644 IHOLD(ip); 3645 error = xfs_trans_commit(tp, 0); 3646 3647 return error; 3648} 3649 3650int 3651xfs_reclaim( 3652 xfs_inode_t *ip) 3653{ 3654 bhv_vnode_t *vp = XFS_ITOV(ip); 3655 3656 vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address); 3657 3658 ASSERT(!VN_MAPPED(vp)); 3659 3660 /* bad inode, get out here ASAP */ 3661 if (VN_BAD(vp)) { 3662 xfs_ireclaim(ip); 3663 return 0; 3664 } 3665 3666 vn_iowait(ip); 3667 3668 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); 3669 3670 /* 3671 * Make sure the atime in the XFS inode is correct before freeing the 3672 * Linux inode. 3673 */ 3674 xfs_synchronize_atime(ip); 3675 3676 /* 3677 * If we have nothing to flush with this inode then complete the 3678 * teardown now, otherwise break the link between the xfs inode and the 3679 * linux inode and clean up the xfs inode later. This avoids flushing 3680 * the inode to disk during the delete operation itself. 3681 * 3682 * When breaking the link, we need to set the XFS_IRECLAIMABLE flag 3683 * first to ensure that xfs_iunpin() will never see an xfs inode 3684 * that has a linux inode being reclaimed. Synchronisation is provided 3685 * by the i_flags_lock. 
3686 */ 3687 if (!ip->i_update_core && (ip->i_itemp == NULL)) { 3688 xfs_ilock(ip, XFS_ILOCK_EXCL); 3689 xfs_iflock(ip); 3690 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); 3691 } else { 3692 xfs_mount_t *mp = ip->i_mount; 3693 3694 /* Protect sync and unpin from us */ 3695 XFS_MOUNT_ILOCK(mp); 3696 spin_lock(&ip->i_flags_lock); 3697 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 3698 vn_to_inode(vp)->i_private = NULL; 3699 ip->i_vnode = NULL; 3700 spin_unlock(&ip->i_flags_lock); 3701 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); 3702 XFS_MOUNT_IUNLOCK(mp); 3703 } 3704 return 0; 3705} 3706 3707int 3708xfs_finish_reclaim( 3709 xfs_inode_t *ip, 3710 int locked, 3711 int sync_mode) 3712{ 3713 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); 3714 bhv_vnode_t *vp = XFS_ITOV_NULL(ip); 3715 int error; 3716 3717 if (vp && VN_BAD(vp)) 3718 goto reclaim; 3719 3720 /* The hash lock here protects a thread in xfs_iget_core from 3721 * racing with us on linking the inode back with a vnode. 3722 * Once we have the XFS_IRECLAIM flag set it will not touch 3723 * us. 3724 */ 3725 write_lock(&pag->pag_ici_lock); 3726 spin_lock(&ip->i_flags_lock); 3727 if (__xfs_iflags_test(ip, XFS_IRECLAIM) || 3728 (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) { 3729 spin_unlock(&ip->i_flags_lock); 3730 write_unlock(&pag->pag_ici_lock); 3731 if (locked) { 3732 xfs_ifunlock(ip); 3733 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3734 } 3735 return 1; 3736 } 3737 __xfs_iflags_set(ip, XFS_IRECLAIM); 3738 spin_unlock(&ip->i_flags_lock); 3739 write_unlock(&pag->pag_ici_lock); 3740 xfs_put_perag(ip->i_mount, pag); 3741 3742 /* 3743 * If the inode is still dirty, then flush it out. If the inode 3744 * is not in the AIL, then it will be OK to flush it delwri as 3745 * long as xfs_iflush() does not keep any references to the inode. 3746 * We leave that decision up to xfs_iflush() since it has the 3747 * knowledge of whether it's OK to simply do a delwri flush of 3748 * the inode or whether we need to wait until the inode is 3749 * pulled from the AIL. 3750 * We get the flush lock regardless, though, just to make sure 3751 * we don't free it while it is being flushed. 3752 */ 3753 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 3754 if (!locked) { 3755 xfs_ilock(ip, XFS_ILOCK_EXCL); 3756 xfs_iflock(ip); 3757 } 3758 3759 if (ip->i_update_core || 3760 ((ip->i_itemp != NULL) && 3761 (ip->i_itemp->ili_format.ilf_fields != 0))) { 3762 error = xfs_iflush(ip, sync_mode); 3763 /* 3764 * If we hit an error, typically because of filesystem 3765 * shutdown, we don't need to let vn_reclaim to know 3766 * because we're gonna reclaim the inode anyway. 3767 */ 3768 if (error) { 3769 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3770 goto reclaim; 3771 } 3772 xfs_iflock(ip); /* synchronize with xfs_iflush_done */ 3773 } 3774 3775 ASSERT(ip->i_update_core == 0); 3776 ASSERT(ip->i_itemp == NULL || 3777 ip->i_itemp->ili_format.ilf_fields == 0); 3778 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3779 } else if (locked) { 3780 /* 3781 * We are not interested in doing an iflush if we're 3782 * in the process of shutting down the filesystem forcibly. 3783 * So, just reclaim the inode. 
3784 */ 3785 xfs_ifunlock(ip); 3786 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3787 } 3788 3789 reclaim: 3790 xfs_ireclaim(ip); 3791 return 0; 3792} 3793 3794int 3795xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock) 3796{ 3797 int purged; 3798 xfs_inode_t *ip, *n; 3799 int done = 0; 3800 3801 while (!done) { 3802 purged = 0; 3803 XFS_MOUNT_ILOCK(mp); 3804 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) { 3805 if (noblock) { 3806 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) 3807 continue; 3808 if (xfs_ipincount(ip) || 3809 !xfs_iflock_nowait(ip)) { 3810 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3811 continue; 3812 } 3813 } 3814 XFS_MOUNT_IUNLOCK(mp); 3815 if (xfs_finish_reclaim(ip, noblock, 3816 XFS_IFLUSH_DELWRI_ELSE_ASYNC)) 3817 delay(1); 3818 purged = 1; 3819 break; 3820 } 3821 3822 done = !purged; 3823 } 3824 3825 XFS_MOUNT_IUNLOCK(mp); 3826 return 0; 3827} 3828 3829/* 3830 * xfs_alloc_file_space() 3831 * This routine allocates disk space for the given file. 3832 * 3833 * If alloc_type == 0, this request is for an ALLOCSP type 3834 * request which will change the file size. In this case, no 3835 * DMAPI event will be generated by the call. A TRUNCATE event 3836 * will be generated later by xfs_setattr. 3837 * 3838 * If alloc_type != 0, this request is for a RESVSP type 3839 * request, and a DMAPI DM_EVENT_WRITE will be generated if the 3840 * lower block boundary byte address is less than the file's 3841 * length. 3842 * 3843 * RETURNS: 3844 * 0 on success 3845 * errno on error 3846 * 3847 */ 3848STATIC int 3849xfs_alloc_file_space( 3850 xfs_inode_t *ip, 3851 xfs_off_t offset, 3852 xfs_off_t len, 3853 int alloc_type, 3854 int attr_flags) 3855{ 3856 xfs_mount_t *mp = ip->i_mount; 3857 xfs_off_t count; 3858 xfs_filblks_t allocated_fsb; 3859 xfs_filblks_t allocatesize_fsb; 3860 xfs_extlen_t extsz, temp; 3861 xfs_fileoff_t startoffset_fsb; 3862 xfs_fsblock_t firstfsb; 3863 int nimaps; 3864 int bmapi_flag; 3865 int quota_flag; 3866 int rt; 3867 xfs_trans_t *tp; 3868 xfs_bmbt_irec_t imaps[1], *imapp; 3869 xfs_bmap_free_t free_list; 3870 uint qblocks, resblks, resrtextents; 3871 int committed; 3872 int error; 3873 3874 vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address); 3875 3876 if (XFS_FORCED_SHUTDOWN(mp)) 3877 return XFS_ERROR(EIO); 3878 3879 if ((error = XFS_QM_DQATTACH(mp, ip, 0))) 3880 return error; 3881 3882 if (len <= 0) 3883 return XFS_ERROR(EINVAL); 3884 3885 rt = XFS_IS_REALTIME_INODE(ip); 3886 extsz = xfs_get_extsz_hint(ip); 3887 3888 count = len; 3889 imapp = &imaps[0]; 3890 nimaps = 1; 3891 bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); 3892 startoffset_fsb = XFS_B_TO_FSBT(mp, offset); 3893 allocatesize_fsb = XFS_B_TO_FSB(mp, count); 3894 3895 /* Generate a DMAPI event if needed. */ 3896 if (alloc_type != 0 && offset < ip->i_size && 3897 (attr_flags&ATTR_DMI) == 0 && 3898 DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { 3899 xfs_off_t end_dmi_offset; 3900 3901 end_dmi_offset = offset+len; 3902 if (end_dmi_offset > ip->i_size) 3903 end_dmi_offset = ip->i_size; 3904 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), 3905 offset, end_dmi_offset - offset, 3906 0, NULL); 3907 if (error) 3908 return error; 3909 } 3910 3911 /* 3912 * Allocate file space until done or until there is an error 3913 */ 3914retry: 3915 while (allocatesize_fsb && !error) { 3916 xfs_fileoff_t s, e; 3917 3918 /* 3919 * Determine space reservations for data/realtime. 
3920 */ 3921 if (unlikely(extsz)) { 3922 s = startoffset_fsb; 3923 do_div(s, extsz); 3924 s *= extsz; 3925 e = startoffset_fsb + allocatesize_fsb; 3926 if ((temp = do_mod(startoffset_fsb, extsz))) 3927 e += temp; 3928 if ((temp = do_mod(e, extsz))) 3929 e += extsz - temp; 3930 } else { 3931 s = 0; 3932 e = allocatesize_fsb; 3933 } 3934 3935 if (unlikely(rt)) { 3936 resrtextents = qblocks = (uint)(e - s); 3937 resrtextents /= mp->m_sb.sb_rextsize; 3938 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 3939 quota_flag = XFS_QMOPT_RES_RTBLKS; 3940 } else { 3941 resrtextents = 0; 3942 resblks = qblocks = \ 3943 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s)); 3944 quota_flag = XFS_QMOPT_RES_REGBLKS; 3945 } 3946 3947 /* 3948 * Allocate and setup the transaction. 3949 */ 3950 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 3951 error = xfs_trans_reserve(tp, resblks, 3952 XFS_WRITE_LOG_RES(mp), resrtextents, 3953 XFS_TRANS_PERM_LOG_RES, 3954 XFS_WRITE_LOG_COUNT); 3955 /* 3956 * Check for running out of space 3957 */ 3958 if (error) { 3959 /* 3960 * Free the transaction structure. 3961 */ 3962 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); 3963 xfs_trans_cancel(tp, 0); 3964 break; 3965 } 3966 xfs_ilock(ip, XFS_ILOCK_EXCL); 3967 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, 3968 qblocks, 0, quota_flag); 3969 if (error) 3970 goto error1; 3971 3972 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 3973 xfs_trans_ihold(tp, ip); 3974 3975 /* 3976 * Issue the xfs_bmapi() call to allocate the blocks 3977 */ 3978 XFS_BMAP_INIT(&free_list, &firstfsb); 3979 error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb, 3980 allocatesize_fsb, bmapi_flag, 3981 &firstfsb, 0, imapp, &nimaps, 3982 &free_list, NULL); 3983 if (error) { 3984 goto error0; 3985 } 3986 3987 /* 3988 * Complete the transaction 3989 */ 3990 error = xfs_bmap_finish(&tp, &free_list, &committed); 3991 if (error) { 3992 goto error0; 3993 } 3994 3995 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 3996 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3997 if (error) { 3998 break; 3999 } 4000 4001 allocated_fsb = imapp->br_blockcount; 4002 4003 if (nimaps == 0) { 4004 error = XFS_ERROR(ENOSPC); 4005 break; 4006 } 4007 4008 startoffset_fsb += allocated_fsb; 4009 allocatesize_fsb -= allocated_fsb; 4010 } 4011dmapi_enospc_check: 4012 if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 && 4013 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { 4014 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, 4015 XFS_ITOV(ip), DM_RIGHT_NULL, 4016 XFS_ITOV(ip), DM_RIGHT_NULL, 4017 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ 4018 if (error == 0) 4019 goto retry; /* Maybe DMAPI app. has made space */ 4020 /* else fall through with error from XFS_SEND_DATA */ 4021 } 4022 4023 return error; 4024 4025error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 4026 xfs_bmap_cancel(&free_list); 4027 XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag); 4028 4029error1: /* Just cancel transaction */ 4030 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 4031 xfs_iunlock(ip, XFS_ILOCK_EXCL); 4032 goto dmapi_enospc_check; 4033} 4034 4035/* 4036 * Zero file bytes between startoff and endoff inclusive. 4037 * The iolock is held exclusive and no blocks are buffered. 
4038 */ 4039STATIC int 4040xfs_zero_remaining_bytes( 4041 xfs_inode_t *ip, 4042 xfs_off_t startoff, 4043 xfs_off_t endoff) 4044{ 4045 xfs_bmbt_irec_t imap; 4046 xfs_fileoff_t offset_fsb; 4047 xfs_off_t lastoffset; 4048 xfs_off_t offset; 4049 xfs_buf_t *bp; 4050 xfs_mount_t *mp = ip->i_mount; 4051 int nimap; 4052 int error = 0; 4053 4054 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 4055 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ? 4056 mp->m_rtdev_targp : mp->m_ddev_targp); 4057 4058 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 4059 offset_fsb = XFS_B_TO_FSBT(mp, offset); 4060 nimap = 1; 4061 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0, 4062 NULL, 0, &imap, &nimap, NULL, NULL); 4063 if (error || nimap < 1) 4064 break; 4065 ASSERT(imap.br_blockcount >= 1); 4066 ASSERT(imap.br_startoff == offset_fsb); 4067 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; 4068 if (lastoffset > endoff) 4069 lastoffset = endoff; 4070 if (imap.br_startblock == HOLESTARTBLOCK) 4071 continue; 4072 ASSERT(imap.br_startblock != DELAYSTARTBLOCK); 4073 if (imap.br_state == XFS_EXT_UNWRITTEN) 4074 continue; 4075 XFS_BUF_UNDONE(bp); 4076 XFS_BUF_UNWRITE(bp); 4077 XFS_BUF_READ(bp); 4078 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); 4079 xfsbdstrat(mp, bp); 4080 if ((error = xfs_iowait(bp))) { 4081 xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", 4082 mp, bp, XFS_BUF_ADDR(bp)); 4083 break; 4084 } 4085 memset(XFS_BUF_PTR(bp) + 4086 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), 4087 0, lastoffset - offset + 1); 4088 XFS_BUF_UNDONE(bp); 4089 XFS_BUF_UNREAD(bp); 4090 XFS_BUF_WRITE(bp); 4091 xfsbdstrat(mp, bp); 4092 if ((error = xfs_iowait(bp))) { 4093 xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", 4094 mp, bp, XFS_BUF_ADDR(bp)); 4095 break; 4096 } 4097 } 4098 xfs_buf_free(bp); 4099 return error; 4100} 4101 4102/* 4103 * xfs_free_file_space() 4104 * This routine frees disk space for the given file. 4105 * 4106 * This routine is only called by xfs_change_file_space 4107 * for an UNRESVSP type call. 
4108 * 4109 * RETURNS: 4110 * 0 on success 4111 * errno on error 4112 * 4113 */ 4114STATIC int 4115xfs_free_file_space( 4116 xfs_inode_t *ip, 4117 xfs_off_t offset, 4118 xfs_off_t len, 4119 int attr_flags) 4120{ 4121 bhv_vnode_t *vp; 4122 int committed; 4123 int done; 4124 xfs_off_t end_dmi_offset; 4125 xfs_fileoff_t endoffset_fsb; 4126 int error; 4127 xfs_fsblock_t firstfsb; 4128 xfs_bmap_free_t free_list; 4129 xfs_bmbt_irec_t imap; 4130 xfs_off_t ioffset; 4131 xfs_extlen_t mod=0; 4132 xfs_mount_t *mp; 4133 int nimap; 4134 uint resblks; 4135 uint rounding; 4136 int rt; 4137 xfs_fileoff_t startoffset_fsb; 4138 xfs_trans_t *tp; 4139 int need_iolock = 1; 4140 4141 vp = XFS_ITOV(ip); 4142 mp = ip->i_mount; 4143 4144 vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address); 4145 4146 if ((error = XFS_QM_DQATTACH(mp, ip, 0))) 4147 return error; 4148 4149 error = 0; 4150 if (len <= 0) /* if nothing being freed */ 4151 return error; 4152 rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME); 4153 startoffset_fsb = XFS_B_TO_FSB(mp, offset); 4154 end_dmi_offset = offset + len; 4155 endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); 4156 4157 if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 && 4158 DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { 4159 if (end_dmi_offset > ip->i_size) 4160 end_dmi_offset = ip->i_size; 4161 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, 4162 offset, end_dmi_offset - offset, 4163 AT_DELAY_FLAG(attr_flags), NULL); 4164 if (error) 4165 return error; 4166 } 4167 4168 if (attr_flags & ATTR_NOLOCK) 4169 need_iolock = 0; 4170 if (need_iolock) { 4171 xfs_ilock(ip, XFS_IOLOCK_EXCL); 4172 vn_iowait(ip); /* wait for the completion of any pending DIOs */ 4173 } 4174 4175 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP); 4176 ioffset = offset & ~(rounding - 1); 4177 4178 if (VN_CACHED(vp) != 0) { 4179 xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1, 4180 ctooff(offtoct(ioffset)), -1); 4181 error = xfs_flushinval_pages(ip, 4182 ctooff(offtoct(ioffset)), 4183 -1, FI_REMAPF_LOCKED); 4184 if (error) 4185 goto out_unlock_iolock; 4186 } 4187 4188 /* 4189 * Need to zero the stuff we're not freeing, on disk. 4190 * If its a realtime file & can't use unwritten extents then we 4191 * actually need to zero the extent edges. Otherwise xfs_bunmapi 4192 * will take care of it for us. 

        /*
         * Need to zero the stuff we're not freeing, on disk.
         * If it's a realtime file & can't use unwritten extents then we
         * actually need to zero the extent edges.  Otherwise xfs_bunmapi
         * will take care of it for us.
         */
        if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
                nimap = 1;
                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
                if (error)
                        goto out_unlock_iolock;
                ASSERT(nimap == 0 || nimap == 1);
                if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
                        xfs_daddr_t     block;

                        ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
                        block = imap.br_startblock;
                        mod = do_div(block, mp->m_sb.sb_rextsize);
                        if (mod)
                                startoffset_fsb += mp->m_sb.sb_rextsize - mod;
                }
                nimap = 1;
                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
                if (error)
                        goto out_unlock_iolock;
                ASSERT(nimap == 0 || nimap == 1);
                if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
                        ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
                        mod++;
                        if (mod && (mod != mp->m_sb.sb_rextsize))
                                endoffset_fsb -= mod;
                }
        }
        if ((done = (endoffset_fsb <= startoffset_fsb)))
                /*
                 * One contiguous piece to clear
                 */
                error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
        else {
                /*
                 * Some full blocks, possibly two pieces to clear
                 */
                if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
                        error = xfs_zero_remaining_bytes(ip, offset,
                                XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
                if (!error &&
                    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
                        error = xfs_zero_remaining_bytes(ip,
                                XFS_FSB_TO_B(mp, endoffset_fsb),
                                offset + len - 1);
        }

        /*
         * free file space until done or until there is an error
         */
        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
        while (!error && !done) {

                /*
                 * allocate and setup the transaction. Allow this
                 * transaction to dip into the reserve blocks to ensure
                 * the freeing of the space succeeds at ENOSPC.
                 */
                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
                tp->t_flags |= XFS_TRANS_RESERVE;
                error = xfs_trans_reserve(tp,
                                resblks,
                                XFS_WRITE_LOG_RES(mp),
                                0,
                                XFS_TRANS_PERM_LOG_RES,
                                XFS_WRITE_LOG_COUNT);

                /*
                 * check for running out of space
                 */
                if (error) {
                        /*
                         * Free the transaction structure.
                         */
                        ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
                                ip->i_udquot, ip->i_gdquot, resblks, 0,
                                XFS_QMOPT_RES_REGBLKS);
                if (error)
                        goto error1;

                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                /*
                 * issue the bunmapi() call to free the blocks
                 */
                XFS_BMAP_INIT(&free_list, &firstfsb);
                error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
                                endoffset_fsb - startoffset_fsb,
                                0, 2, &firstfsb, &free_list, NULL, &done);
                if (error) {
                        goto error0;
                }

                /*
                 * complete the transaction
                 */
                error = xfs_bmap_finish(&tp, &free_list, &committed);
                if (error) {
                        goto error0;
                }

                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }

 out_unlock_iolock:
        if (need_iolock)
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return error;

 error0:
        xfs_bmap_cancel(&free_list);
 error1:
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
        xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
                        XFS_ILOCK_EXCL);
        return error;
}
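
/*
 * Worked example for xfs_free_file_space (illustrative numbers, ordinary
 * non-realtime case): freeing offset = 100, len = 10000 on a 4k block
 * file system gives startoffset_fsb = 1 and endoffset_fsb = 2, so only
 * block 1 (bytes 4096-8191) is actually unmapped, while the partial
 * edges, bytes 100-4095 and 8192-10099, are zeroed on disk by
 * xfs_zero_remaining_bytes.
 */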

/*
 * xfs_change_file_space()
 * This routine allocates or frees disk space for the given file.
 * The user specified parameters are checked for alignment and size
 * limitations.
 *
 * RETURNS:
 *       0 on success
 *      errno on error
 *
 */
int
xfs_change_file_space(
        xfs_inode_t     *ip,
        int             cmd,
        xfs_flock64_t   *bf,
        xfs_off_t       offset,
        cred_t          *credp,
        int             attr_flags)
{
        xfs_mount_t     *mp = ip->i_mount;
        int             clrprealloc;
        int             error;
        xfs_fsize_t     fsize;
        int             setprealloc;
        xfs_off_t       startoffset;
        xfs_off_t       llen;
        xfs_trans_t     *tp;
        bhv_vattr_t     va;

        vn_trace_entry(ip, __FUNCTION__, (inst_t *)__return_address);

        /*
         * must be a regular file and have write permission
         */
        if (!S_ISREG(ip->i_d.di_mode))
                return XFS_ERROR(EINVAL);

        xfs_ilock(ip, XFS_ILOCK_SHARED);

        if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return error;
        }

        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        switch (bf->l_whence) {
        case 0: /*SEEK_SET*/
                break;
        case 1: /*SEEK_CUR*/
                bf->l_start += offset;
                break;
        case 2: /*SEEK_END*/
                bf->l_start += ip->i_size;
                break;
        default:
                return XFS_ERROR(EINVAL);
        }

        llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;

        if (   (bf->l_start < 0)
            || (bf->l_start > XFS_MAXIOFFSET(mp))
            || (bf->l_start + llen < 0)
            || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
                return XFS_ERROR(EINVAL);

        bf->l_whence = 0;

        startoffset = bf->l_start;
        fsize = ip->i_size;
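
        /*
         * For example (illustrative values): an l_whence of SEEK_CUR with
         * l_start = -4096 and a caller file position (offset) of 8192
         * yields an absolute l_start of 4096; with l_len = 8192 the last
         * byte is l_start + llen = 12287, and both ends must lie within
         * XFS_MAXIOFFSET(mp) or the request is rejected with EINVAL.
         */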

        /*
         * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
         * file space.
         * These calls do NOT zero the data space allocated to the file,
         * nor do they change the file size.
         *
         * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
         * space.
         * These calls cause the new file data to be zeroed and the file
         * size to be changed.
         */
        setprealloc = clrprealloc = 0;

        switch (cmd) {
        case XFS_IOC_RESVSP:
        case XFS_IOC_RESVSP64:
                error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
                                                1, attr_flags);
                if (error)
                        return error;
                setprealloc = 1;
                break;

        case XFS_IOC_UNRESVSP:
        case XFS_IOC_UNRESVSP64:
                if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
                                                attr_flags)))
                        return error;
                break;

        case XFS_IOC_ALLOCSP:
        case XFS_IOC_ALLOCSP64:
        case XFS_IOC_FREESP:
        case XFS_IOC_FREESP64:
                if (startoffset > fsize) {
                        error = xfs_alloc_file_space(ip, fsize,
                                        startoffset - fsize, 0, attr_flags);
                        if (error)
                                break;
                }

                va.va_mask = XFS_AT_SIZE;
                va.va_size = startoffset;

                error = xfs_setattr(ip, &va, attr_flags, credp);

                if (error)
                        return error;

                clrprealloc = 1;
                break;

        default:
                ASSERT(0);
                return XFS_ERROR(EINVAL);
        }

        /*
         * update the inode timestamp, mode, and prealloc flag bits
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);

        if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
                                        0, 0, 0))) {
                /* ASSERT(0); */
                xfs_trans_cancel(tp, 0);
                return error;
        }

        xfs_ilock(ip, XFS_ILOCK_EXCL);

        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_ihold(tp, ip);

        if ((attr_flags & ATTR_DMI) == 0) {
                ip->i_d.di_mode &= ~S_ISUID;

                /*
                 * Note that we don't have to worry about mandatory
                 * file locking being disabled here because we only
                 * clear the S_ISGID bit if the Group execute bit is
                 * on, but if it was on then mandatory locking wouldn't
                 * have been enabled.
                 */
                if (ip->i_d.di_mode & S_IXGRP)
                        ip->i_d.di_mode &= ~S_ISGID;

                xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        }
        if (setprealloc)
                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
        else if (clrprealloc)
                ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        xfs_trans_set_sync(tp);

        error = xfs_trans_commit(tp, 0);

        xfs_iunlock(ip, XFS_ILOCK_EXCL);

        return error;
}
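
/*
 * Rough sketch of how this path is typically reached from user space
 * (illustrative only, reserving 16MB at the start of a file):
 *
 *      struct xfs_flock64 fl = {
 *              .l_whence = SEEK_SET,
 *              .l_start  = 0,
 *              .l_len    = 16 * 1024 * 1024,
 *      };
 *      if (xfsctl(path, fd, XFS_IOC_RESVSP64, &fl) < 0)
 *              perror("XFS_IOC_RESVSP64");
 *
 * which ends up in xfs_change_file_space() with cmd == XFS_IOC_RESVSP64
 * and, from there, in xfs_alloc_file_space() above.
 */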