fs/xfs/xfs_vnodeops.c at v2.6.23-rc2

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / xfs / xfs_vnodeops.c
at v2.6.23-rc2 4755 lines 119 kB view raw
wrap content
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18
  19#include "xfs.h"
  20#include "xfs_fs.h"
  21#include "xfs_types.h"
  22#include "xfs_bit.h"
  23#include "xfs_log.h"
  24#include "xfs_inum.h"
  25#include "xfs_trans.h"
  26#include "xfs_sb.h"
  27#include "xfs_ag.h"
  28#include "xfs_dir2.h"
  29#include "xfs_dmapi.h"
  30#include "xfs_mount.h"
  31#include "xfs_da_btree.h"
  32#include "xfs_bmap_btree.h"
  33#include "xfs_alloc_btree.h"
  34#include "xfs_ialloc_btree.h"
  35#include "xfs_dir2_sf.h"
  36#include "xfs_attr_sf.h"
  37#include "xfs_dinode.h"
  38#include "xfs_inode.h"
  39#include "xfs_inode_item.h"
  40#include "xfs_itable.h"
  41#include "xfs_btree.h"
  42#include "xfs_ialloc.h"
  43#include "xfs_alloc.h"
  44#include "xfs_bmap.h"
  45#include "xfs_attr.h"
  46#include "xfs_rw.h"
  47#include "xfs_error.h"
  48#include "xfs_quota.h"
  49#include "xfs_utils.h"
  50#include "xfs_rtalloc.h"
  51#include "xfs_refcache.h"
  52#include "xfs_trans_space.h"
  53#include "xfs_log_priv.h"
  54#include "xfs_filestream.h"
  55
  56STATIC int
  57xfs_open(
  58	bhv_desc_t	*bdp,
  59	cred_t		*credp)
  60{
  61	int		mode;
  62	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
  63	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
  64
  65	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  66		return XFS_ERROR(EIO);
  67
  68	/*
  69	 * If it's a directory with any blocks, read-ahead block 0
  70	 * as we're almost certain to have the next operation be a read there.
  71	 */
  72	if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
  73		mode = xfs_ilock_map_shared(ip);
  74		if (ip->i_d.di_nextents > 0)
  75			(void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  76		xfs_iunlock(ip, mode);
  77	}
  78	return 0;
  79}
  80
  81/*
  82 * xfs_getattr
  83 */
  84STATIC int
  85xfs_getattr(
  86	bhv_desc_t	*bdp,
  87	bhv_vattr_t	*vap,
  88	int		flags,
  89	cred_t		*credp)
  90{
  91	xfs_inode_t	*ip;
  92	xfs_mount_t	*mp;
  93	bhv_vnode_t	*vp;
  94
  95	vp  = BHV_TO_VNODE(bdp);
  96	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
  97
  98	ip = XFS_BHVTOI(bdp);
  99	mp = ip->i_mount;
 100
 101	if (XFS_FORCED_SHUTDOWN(mp))
 102		return XFS_ERROR(EIO);
 103
 104	if (!(flags & ATTR_LAZY))
 105		xfs_ilock(ip, XFS_ILOCK_SHARED);
 106
 107	vap->va_size = XFS_ISIZE(ip);
 108	if (vap->va_mask == XFS_AT_SIZE)
 109		goto all_done;
 110
 111	vap->va_nblocks =
 112		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 113	vap->va_nodeid = ip->i_ino;
 114#if XFS_BIG_INUMS
 115	vap->va_nodeid += mp->m_inoadd;
 116#endif
 117	vap->va_nlink = ip->i_d.di_nlink;
 118
 119	/*
 120	 * Quick exit for non-stat callers
 121	 */
 122	if ((vap->va_mask &
 123	    ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
 124	      XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
 125		goto all_done;
 126
 127	/*
 128	 * Copy from in-core inode.
 129	 */
 130	vap->va_mode = ip->i_d.di_mode;
 131	vap->va_uid = ip->i_d.di_uid;
 132	vap->va_gid = ip->i_d.di_gid;
 133	vap->va_projid = ip->i_d.di_projid;
 134
 135	/*
 136	 * Check vnode type block/char vs. everything else.
 137	 */
 138	switch (ip->i_d.di_mode & S_IFMT) {
 139	case S_IFBLK:
 140	case S_IFCHR:
 141		vap->va_rdev = ip->i_df.if_u2.if_rdev;
 142		vap->va_blocksize = BLKDEV_IOSIZE;
 143		break;
 144	default:
 145		vap->va_rdev = 0;
 146
 147		if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
 148			vap->va_blocksize = xfs_preferred_iosize(mp);
 149		} else {
 150
 151			/*
 152			 * If the file blocks are being allocated from a
 153			 * realtime partition, then return the inode's
 154			 * realtime extent size or the realtime volume's
 155			 * extent size.
 156			 */
 157			vap->va_blocksize =
 158				xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
 159		}
 160		break;
 161	}
 162
 163	vn_atime_to_timespec(vp, &vap->va_atime);
 164	vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 165	vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
 166	vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
 167	vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
 168
 169	/*
 170	 * Exit for stat callers.  See if any of the rest of the fields
 171	 * to be filled in are needed.
 172	 */
 173	if ((vap->va_mask &
 174	     (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 175	      XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 176		goto all_done;
 177
 178	/*
 179	 * Convert di_flags to xflags.
 180	 */
 181	vap->va_xflags = xfs_ip2xflags(ip);
 182
 183	/*
 184	 * Exit for inode revalidate.  See if any of the rest of
 185	 * the fields to be filled in are needed.
 186	 */
 187	if ((vap->va_mask &
 188	     (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 189	      XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 190		goto all_done;
 191
 192	vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
 193	vap->va_nextents =
 194		(ip->i_df.if_flags & XFS_IFEXTENTS) ?
 195			ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
 196			ip->i_d.di_nextents;
 197	if (ip->i_afp)
 198		vap->va_anextents =
 199			(ip->i_afp->if_flags & XFS_IFEXTENTS) ?
 200				ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
 201				 ip->i_d.di_anextents;
 202	else
 203		vap->va_anextents = 0;
 204	vap->va_gen = ip->i_d.di_gen;
 205
 206 all_done:
 207	if (!(flags & ATTR_LAZY))
 208		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 209	return 0;
 210}
 211
 212
 213/*
 214 * xfs_setattr
 215 */
 216int
 217xfs_setattr(
 218	bhv_desc_t		*bdp,
 219	bhv_vattr_t		*vap,
 220	int			flags,
 221	cred_t			*credp)
 222{
 223	xfs_inode_t		*ip;
 224	xfs_trans_t		*tp;
 225	xfs_mount_t		*mp;
 226	int			mask;
 227	int			code;
 228	uint			lock_flags;
 229	uint			commit_flags=0;
 230	uid_t			uid=0, iuid=0;
 231	gid_t			gid=0, igid=0;
 232	int			timeflags = 0;
 233	bhv_vnode_t		*vp;
 234	xfs_prid_t		projid=0, iprojid=0;
 235	int			mandlock_before, mandlock_after;
 236	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
 237	int			file_owner;
 238	int			need_iolock = 1;
 239
 240	vp = BHV_TO_VNODE(bdp);
 241	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 242
 243	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
 244		return XFS_ERROR(EROFS);
 245
 246	/*
 247	 * Cannot set certain attributes.
 248	 */
 249	mask = vap->va_mask;
 250	if (mask & XFS_AT_NOSET) {
 251		return XFS_ERROR(EINVAL);
 252	}
 253
 254	ip = XFS_BHVTOI(bdp);
 255	mp = ip->i_mount;
 256
 257	if (XFS_FORCED_SHUTDOWN(mp))
 258		return XFS_ERROR(EIO);
 259
 260	/*
 261	 * Timestamps do not need to be logged and hence do not
 262	 * need to be done within a transaction.
 263	 */
 264	if (mask & XFS_AT_UPDTIMES) {
 265		ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 266		timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 267			    ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 268			    ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 269		xfs_ichgtime(ip, timeflags);
 270		return 0;
 271	}
 272
 273	olddquot1 = olddquot2 = NULL;
 274	udqp = gdqp = NULL;
 275
 276	/*
 277	 * If disk quotas is on, we make sure that the dquots do exist on disk,
 278	 * before we start any other transactions. Trying to do this later
 279	 * is messy. We don't care to take a readlock to look at the ids
 280	 * in inode here, because we can't hold it across the trans_reserve.
 281	 * If the IDs do change before we take the ilock, we're covered
 282	 * because the i_*dquot fields will get updated anyway.
 283	 */
 284	if (XFS_IS_QUOTA_ON(mp) &&
 285	    (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
 286		uint	qflags = 0;
 287
 288		if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
 289			uid = vap->va_uid;
 290			qflags |= XFS_QMOPT_UQUOTA;
 291		} else {
 292			uid = ip->i_d.di_uid;
 293		}
 294		if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
 295			gid = vap->va_gid;
 296			qflags |= XFS_QMOPT_GQUOTA;
 297		}  else {
 298			gid = ip->i_d.di_gid;
 299		}
 300		if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
 301			projid = vap->va_projid;
 302			qflags |= XFS_QMOPT_PQUOTA;
 303		}  else {
 304			projid = ip->i_d.di_projid;
 305		}
 306		/*
 307		 * We take a reference when we initialize udqp and gdqp,
 308		 * so it is important that we never blindly double trip on
 309		 * the same variable. See xfs_create() for an example.
 310		 */
 311		ASSERT(udqp == NULL);
 312		ASSERT(gdqp == NULL);
 313		code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 314					 &udqp, &gdqp);
 315		if (code)
 316			return code;
 317	}
 318
 319	/*
 320	 * For the other attributes, we acquire the inode lock and
 321	 * first do an error checking pass.
 322	 */
 323	tp = NULL;
 324	lock_flags = XFS_ILOCK_EXCL;
 325	if (flags & ATTR_NOLOCK)
 326		need_iolock = 0;
 327	if (!(mask & XFS_AT_SIZE)) {
 328		if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 329		    (mp->m_flags & XFS_MOUNT_WSYNC)) {
 330			tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 331			commit_flags = 0;
 332			if ((code = xfs_trans_reserve(tp, 0,
 333						     XFS_ICHANGE_LOG_RES(mp), 0,
 334						     0, 0))) {
 335				lock_flags = 0;
 336				goto error_return;
 337			}
 338		}
 339	} else {
 340		if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
 341		    !(flags & ATTR_DMI)) {
 342			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 343			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
 344				vap->va_size, 0, dmflags, NULL);
 345			if (code) {
 346				lock_flags = 0;
 347				goto error_return;
 348			}
 349		}
 350		if (need_iolock)
 351			lock_flags |= XFS_IOLOCK_EXCL;
 352	}
 353
 354	xfs_ilock(ip, lock_flags);
 355
 356	/* boolean: are we the file owner? */
 357	file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
 358
 359	/*
 360	 * Change various properties of a file.
 361	 * Only the owner or users with CAP_FOWNER
 362	 * capability may do these things.
 363	 */
 364	if (mask &
 365	    (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 366	     XFS_AT_GID|XFS_AT_PROJID)) {
 367		/*
 368		 * CAP_FOWNER overrides the following restrictions:
 369		 *
 370		 * The user ID of the calling process must be equal
 371		 * to the file owner ID, except in cases where the
 372		 * CAP_FSETID capability is applicable.
 373		 */
 374		if (!file_owner && !capable(CAP_FOWNER)) {
 375			code = XFS_ERROR(EPERM);
 376			goto error_return;
 377		}
 378
 379		/*
 380		 * CAP_FSETID overrides the following restrictions:
 381		 *
 382		 * The effective user ID of the calling process shall match
 383		 * the file owner when setting the set-user-ID and
 384		 * set-group-ID bits on that file.
 385		 *
 386		 * The effective group ID or one of the supplementary group
 387		 * IDs of the calling process shall match the group owner of
 388		 * the file when setting the set-group-ID bit on that file
 389		 */
 390		if (mask & XFS_AT_MODE) {
 391			mode_t m = 0;
 392
 393			if ((vap->va_mode & S_ISUID) && !file_owner)
 394				m |= S_ISUID;
 395			if ((vap->va_mode & S_ISGID) &&
 396			    !in_group_p((gid_t)ip->i_d.di_gid))
 397				m |= S_ISGID;
 398#if 0
 399			/* Linux allows this, Irix doesn't. */
 400			if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
 401				m |= S_ISVTX;
 402#endif
 403			if (m && !capable(CAP_FSETID))
 404				vap->va_mode &= ~m;
 405		}
 406	}
 407
 408	/*
 409	 * Change file ownership.  Must be the owner or privileged.
 410	 * If the system was configured with the "restricted_chown"
 411	 * option, the owner is not permitted to give away the file,
 412	 * and can change the group id only to a group of which he
 413	 * or she is a member.
 414	 */
 415	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 416		/*
 417		 * These IDs could have changed since we last looked at them.
 418		 * But, we're assured that if the ownership did change
 419		 * while we didn't have the inode locked, inode's dquot(s)
 420		 * would have changed also.
 421		 */
 422		iuid = ip->i_d.di_uid;
 423		iprojid = ip->i_d.di_projid;
 424		igid = ip->i_d.di_gid;
 425		gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 426		uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 427		projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 428			 iprojid;
 429
 430		/*
 431		 * CAP_CHOWN overrides the following restrictions:
 432		 *
 433		 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 434		 * shall override the restriction that a process cannot
 435		 * change the user ID of a file it owns and the restriction
 436		 * that the group ID supplied to the chown() function
 437		 * shall be equal to either the group ID or one of the
 438		 * supplementary group IDs of the calling process.
 439		 */
 440		if (restricted_chown &&
 441		    (iuid != uid || (igid != gid &&
 442				     !in_group_p((gid_t)gid))) &&
 443		    !capable(CAP_CHOWN)) {
 444			code = XFS_ERROR(EPERM);
 445			goto error_return;
 446		}
 447		/*
 448		 * Do a quota reservation only if uid/projid/gid is actually
 449		 * going to change.
 450		 */
 451		if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 452		    (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 453		    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 454			ASSERT(tp);
 455			code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 456						capable(CAP_FOWNER) ?
 457						XFS_QMOPT_FORCE_RES : 0);
 458			if (code)	/* out of quota */
 459				goto error_return;
 460		}
 461	}
 462
 463	/*
 464	 * Truncate file.  Must have write permission and not be a directory.
 465	 */
 466	if (mask & XFS_AT_SIZE) {
 467		/* Short circuit the truncate case for zero length files */
 468		if ((vap->va_size == 0) &&
 469		   (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
 470			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 471			lock_flags &= ~XFS_ILOCK_EXCL;
 472			if (mask & XFS_AT_CTIME)
 473				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 474			code = 0;
 475			goto error_return;
 476		}
 477
 478		if (VN_ISDIR(vp)) {
 479			code = XFS_ERROR(EISDIR);
 480			goto error_return;
 481		} else if (!VN_ISREG(vp)) {
 482			code = XFS_ERROR(EINVAL);
 483			goto error_return;
 484		}
 485		/*
 486		 * Make sure that the dquots are attached to the inode.
 487		 */
 488		if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 489			goto error_return;
 490	}
 491
 492	/*
 493	 * Change file access or modified times.
 494	 */
 495	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 496		if (!file_owner) {
 497			if ((flags & ATTR_UTIME) &&
 498			    !capable(CAP_FOWNER)) {
 499				code = XFS_ERROR(EPERM);
 500				goto error_return;
 501			}
 502		}
 503	}
 504
 505	/*
 506	 * Change extent size or realtime flag.
 507	 */
 508	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 509		/*
 510		 * Can't change extent size if any extents are allocated.
 511		 */
 512		if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
 513		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 514		     vap->va_extsize) ) {
 515			code = XFS_ERROR(EINVAL);	/* EFBIG? */
 516			goto error_return;
 517		}
 518
 519		/*
 520		 * Can't change realtime flag if any extents are allocated.
 521		 */
 522		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 523		    (mask & XFS_AT_XFLAGS) &&
 524		    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
 525		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 526			code = XFS_ERROR(EINVAL);	/* EFBIG? */
 527			goto error_return;
 528		}
 529		/*
 530		 * Extent size must be a multiple of the appropriate block
 531		 * size, if set at all.
 532		 */
 533		if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 534			xfs_extlen_t	size;
 535
 536			if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
 537			    ((mask & XFS_AT_XFLAGS) &&
 538			    (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 539				size = mp->m_sb.sb_rextsize <<
 540				       mp->m_sb.sb_blocklog;
 541			} else {
 542				size = mp->m_sb.sb_blocksize;
 543			}
 544			if (vap->va_extsize % size) {
 545				code = XFS_ERROR(EINVAL);
 546				goto error_return;
 547			}
 548		}
 549		/*
 550		 * If realtime flag is set then must have realtime data.
 551		 */
 552		if ((mask & XFS_AT_XFLAGS) &&
 553		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 554			if ((mp->m_sb.sb_rblocks == 0) ||
 555			    (mp->m_sb.sb_rextsize == 0) ||
 556			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 557				code = XFS_ERROR(EINVAL);
 558				goto error_return;
 559			}
 560		}
 561
 562		/*
 563		 * Can't modify an immutable/append-only file unless
 564		 * we have appropriate permission.
 565		 */
 566		if ((mask & XFS_AT_XFLAGS) &&
 567		    (ip->i_d.di_flags &
 568				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
 569		     (vap->va_xflags &
 570				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 571		    !capable(CAP_LINUX_IMMUTABLE)) {
 572			code = XFS_ERROR(EPERM);
 573			goto error_return;
 574		}
 575	}
 576
 577	/*
 578	 * Now we can make the changes.  Before we join the inode
 579	 * to the transaction, if XFS_AT_SIZE is set then take care of
 580	 * the part of the truncation that must be done without the
 581	 * inode lock.  This needs to be done before joining the inode
 582	 * to the transaction, because the inode cannot be unlocked
 583	 * once it is a part of the transaction.
 584	 */
 585	if (mask & XFS_AT_SIZE) {
 586		code = 0;
 587		if ((vap->va_size > ip->i_size) &&
 588		    (flags & ATTR_NOSIZETOK) == 0) {
 589			code = xfs_igrow_start(ip, vap->va_size, credp);
 590		}
 591		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 592
 593		/*
 594		 * We are going to log the inode size change in this
 595		 * transaction so any previous writes that are beyond the on
 596		 * disk EOF and the new EOF that have not been written out need
 597		 * to be written here. If we do not write the data out, we
 598		 * expose ourselves to the null files problem.
 599		 *
 600		 * Only flush from the on disk size to the smaller of the in
 601		 * memory file size or the new size as that's the range we
 602		 * really care about here and prevents waiting for other data
 603		 * not within the range we care about here.
 604		 */
 605		if (!code &&
 606		    (ip->i_size != ip->i_d.di_size) &&
 607		    (vap->va_size > ip->i_d.di_size)) {
 608			code = bhv_vop_flush_pages(XFS_ITOV(ip),
 609					ip->i_d.di_size, vap->va_size,
 610					XFS_B_ASYNC, FI_NONE);
 611		}
 612
 613		/* wait for all I/O to complete */
 614		vn_iowait(vp);
 615
 616		if (!code)
 617			code = xfs_itruncate_data(ip, vap->va_size);
 618		if (code) {
 619			ASSERT(tp == NULL);
 620			lock_flags &= ~XFS_ILOCK_EXCL;
 621			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 622			goto error_return;
 623		}
 624		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 625		if ((code = xfs_trans_reserve(tp, 0,
 626					     XFS_ITRUNCATE_LOG_RES(mp), 0,
 627					     XFS_TRANS_PERM_LOG_RES,
 628					     XFS_ITRUNCATE_LOG_COUNT))) {
 629			xfs_trans_cancel(tp, 0);
 630			if (need_iolock)
 631				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 632			return code;
 633		}
 634		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 635		xfs_ilock(ip, XFS_ILOCK_EXCL);
 636	}
 637
 638	if (tp) {
 639		xfs_trans_ijoin(tp, ip, lock_flags);
 640		xfs_trans_ihold(tp, ip);
 641	}
 642
 643	/* determine whether mandatory locking mode changes */
 644	mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
 645
 646	/*
 647	 * Truncate file.  Must have write permission and not be a directory.
 648	 */
 649	if (mask & XFS_AT_SIZE) {
 650		if (vap->va_size > ip->i_size) {
 651			xfs_igrow_finish(tp, ip, vap->va_size,
 652			    !(flags & ATTR_DMI));
 653		} else if ((vap->va_size <= ip->i_size) ||
 654			   ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 655			/*
 656			 * signal a sync transaction unless
 657			 * we're truncating an already unlinked
 658			 * file on a wsync filesystem
 659			 */
 660			code = xfs_itruncate_finish(&tp, ip,
 661					    (xfs_fsize_t)vap->va_size,
 662					    XFS_DATA_FORK,
 663					    ((ip->i_d.di_nlink != 0 ||
 664					      !(mp->m_flags & XFS_MOUNT_WSYNC))
 665					     ? 1 : 0));
 666			if (code)
 667				goto abort_return;
 668			/*
 669			 * Truncated "down", so we're removing references
 670			 * to old data here - if we now delay flushing for
 671			 * a long time, we expose ourselves unduly to the
 672			 * notorious NULL files problem.  So, we mark this
 673			 * vnode and flush it when the file is closed, and
 674			 * do not wait the usual (long) time for writeout.
 675			 */
 676			VTRUNCATE(vp);
 677		}
 678		/*
 679		 * Have to do this even if the file's size doesn't change.
 680		 */
 681		timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 682	}
 683
 684	/*
 685	 * Change file access modes.
 686	 */
 687	if (mask & XFS_AT_MODE) {
 688		ip->i_d.di_mode &= S_IFMT;
 689		ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
 690
 691		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 692		timeflags |= XFS_ICHGTIME_CHG;
 693	}
 694
 695	/*
 696	 * Change file ownership.  Must be the owner or privileged.
 697	 * If the system was configured with the "restricted_chown"
 698	 * option, the owner is not permitted to give away the file,
 699	 * and can change the group id only to a group of which he
 700	 * or she is a member.
 701	 */
 702	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 703		/*
 704		 * CAP_FSETID overrides the following restrictions:
 705		 *
 706		 * The set-user-ID and set-group-ID bits of a file will be
 707		 * cleared upon successful return from chown()
 708		 */
 709		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 710		    !capable(CAP_FSETID)) {
 711			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 712		}
 713
 714		/*
 715		 * Change the ownerships and register quota modifications
 716		 * in the transaction.
 717		 */
 718		if (iuid != uid) {
 719			if (XFS_IS_UQUOTA_ON(mp)) {
 720				ASSERT(mask & XFS_AT_UID);
 721				ASSERT(udqp);
 722				olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 723							&ip->i_udquot, udqp);
 724			}
 725			ip->i_d.di_uid = uid;
 726		}
 727		if (igid != gid) {
 728			if (XFS_IS_GQUOTA_ON(mp)) {
 729				ASSERT(!XFS_IS_PQUOTA_ON(mp));
 730				ASSERT(mask & XFS_AT_GID);
 731				ASSERT(gdqp);
 732				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 733							&ip->i_gdquot, gdqp);
 734			}
 735			ip->i_d.di_gid = gid;
 736		}
 737		if (iprojid != projid) {
 738			if (XFS_IS_PQUOTA_ON(mp)) {
 739				ASSERT(!XFS_IS_GQUOTA_ON(mp));
 740				ASSERT(mask & XFS_AT_PROJID);
 741				ASSERT(gdqp);
 742				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 743							&ip->i_gdquot, gdqp);
 744			}
 745			ip->i_d.di_projid = projid;
 746			/*
 747			 * We may have to rev the inode as well as
 748			 * the superblock version number since projids didn't
 749			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 750			 */
 751			if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 752				xfs_bump_ino_vers2(tp, ip);
 753		}
 754
 755		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 756		timeflags |= XFS_ICHGTIME_CHG;
 757	}
 758
 759
 760	/*
 761	 * Change file access or modified times.
 762	 */
 763	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 764		if (mask & XFS_AT_ATIME) {
 765			ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 766			ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 767			ip->i_update_core = 1;
 768			timeflags &= ~XFS_ICHGTIME_ACC;
 769		}
 770		if (mask & XFS_AT_MTIME) {
 771			ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 772			ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 773			timeflags &= ~XFS_ICHGTIME_MOD;
 774			timeflags |= XFS_ICHGTIME_CHG;
 775		}
 776		if (tp && (flags & ATTR_UTIME))
 777			xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 778	}
 779
 780	/*
 781	 * Change XFS-added attributes.
 782	 */
 783	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 784		if (mask & XFS_AT_EXTSIZE) {
 785			/*
 786			 * Converting bytes to fs blocks.
 787			 */
 788			ip->i_d.di_extsize = vap->va_extsize >>
 789				mp->m_sb.sb_blocklog;
 790		}
 791		if (mask & XFS_AT_XFLAGS) {
 792			uint	di_flags;
 793
 794			/* can't set PREALLOC this way, just preserve it */
 795			di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
 796			if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 797				di_flags |= XFS_DIFLAG_IMMUTABLE;
 798			if (vap->va_xflags & XFS_XFLAG_APPEND)
 799				di_flags |= XFS_DIFLAG_APPEND;
 800			if (vap->va_xflags & XFS_XFLAG_SYNC)
 801				di_flags |= XFS_DIFLAG_SYNC;
 802			if (vap->va_xflags & XFS_XFLAG_NOATIME)
 803				di_flags |= XFS_DIFLAG_NOATIME;
 804			if (vap->va_xflags & XFS_XFLAG_NODUMP)
 805				di_flags |= XFS_DIFLAG_NODUMP;
 806			if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 807				di_flags |= XFS_DIFLAG_PROJINHERIT;
 808			if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
 809				di_flags |= XFS_DIFLAG_NODEFRAG;
 810			if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
 811				di_flags |= XFS_DIFLAG_FILESTREAM;
 812			if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 813				if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 814					di_flags |= XFS_DIFLAG_RTINHERIT;
 815				if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 816					di_flags |= XFS_DIFLAG_NOSYMLINKS;
 817				if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
 818					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 819			} else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 820				if (vap->va_xflags & XFS_XFLAG_REALTIME) {
 821					di_flags |= XFS_DIFLAG_REALTIME;
 822					ip->i_iocore.io_flags |= XFS_IOCORE_RT;
 823				} else {
 824					ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
 825				}
 826				if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
 827					di_flags |= XFS_DIFLAG_EXTSIZE;
 828			}
 829			ip->i_d.di_flags = di_flags;
 830		}
 831		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 832		timeflags |= XFS_ICHGTIME_CHG;
 833	}
 834
 835	/*
 836	 * Change file inode change time only if XFS_AT_CTIME set
 837	 * AND we have been called by a DMI function.
 838	 */
 839
 840	if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 841		ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 842		ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 843		ip->i_update_core = 1;
 844		timeflags &= ~XFS_ICHGTIME_CHG;
 845	}
 846
 847	/*
 848	 * Send out timestamp changes that need to be set to the
 849	 * current time.  Not done when called by a DMI function.
 850	 */
 851	if (timeflags && !(flags & ATTR_DMI))
 852		xfs_ichgtime(ip, timeflags);
 853
 854	XFS_STATS_INC(xs_ig_attrchg);
 855
 856	/*
 857	 * If this is a synchronous mount, make sure that the
 858	 * transaction goes to disk before returning to the user.
 859	 * This is slightly sub-optimal in that truncates require
 860	 * two sync transactions instead of one for wsync filesystems.
 861	 * One for the truncate and one for the timestamps since we
 862	 * don't want to change the timestamps unless we're sure the
 863	 * truncate worked.  Truncates are less than 1% of the laddis
 864	 * mix so this probably isn't worth the trouble to optimize.
 865	 */
 866	code = 0;
 867	if (tp) {
 868		if (mp->m_flags & XFS_MOUNT_WSYNC)
 869			xfs_trans_set_sync(tp);
 870
 871		code = xfs_trans_commit(tp, commit_flags);
 872	}
 873
 874	/*
 875	 * If the (regular) file's mandatory locking mode changed, then
 876	 * notify the vnode.  We do this under the inode lock to prevent
 877	 * racing calls to vop_vnode_change.
 878	 */
 879	mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 880	if (mandlock_before != mandlock_after) {
 881		bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
 882				 mandlock_after);
 883	}
 884
 885	xfs_iunlock(ip, lock_flags);
 886
 887	/*
 888	 * Release any dquot(s) the inode had kept before chown.
 889	 */
 890	XFS_QM_DQRELE(mp, olddquot1);
 891	XFS_QM_DQRELE(mp, olddquot2);
 892	XFS_QM_DQRELE(mp, udqp);
 893	XFS_QM_DQRELE(mp, gdqp);
 894
 895	if (code) {
 896		return code;
 897	}
 898
 899	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
 900	    !(flags & ATTR_DMI)) {
 901		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
 902					NULL, DM_RIGHT_NULL, NULL, NULL,
 903					0, 0, AT_DELAY_FLAG(flags));
 904	}
 905	return 0;
 906
 907 abort_return:
 908	commit_flags |= XFS_TRANS_ABORT;
 909	/* FALLTHROUGH */
 910 error_return:
 911	XFS_QM_DQRELE(mp, udqp);
 912	XFS_QM_DQRELE(mp, gdqp);
 913	if (tp) {
 914		xfs_trans_cancel(tp, commit_flags);
 915	}
 916	if (lock_flags != 0) {
 917		xfs_iunlock(ip, lock_flags);
 918	}
 919	return code;
 920}
 921
 922
 923/*
 924 * xfs_access
 925 * Null conversion from vnode mode bits to inode mode bits, as in efs.
 926 */
 927STATIC int
 928xfs_access(
 929	bhv_desc_t	*bdp,
 930	int		mode,
 931	cred_t		*credp)
 932{
 933	xfs_inode_t	*ip;
 934	int		error;
 935
 936	vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
 937					       (inst_t *)__return_address);
 938
 939	ip = XFS_BHVTOI(bdp);
 940	xfs_ilock(ip, XFS_ILOCK_SHARED);
 941	error = xfs_iaccess(ip, mode, credp);
 942	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 943	return error;
 944}
 945
 946
 947/*
 948 * The maximum pathlen is 1024 bytes. Since the minimum file system
 949 * blocksize is 512 bytes, we can get a max of 2 extents back from
 950 * bmapi.
 951 */
 952#define SYMLINK_MAPS 2
 953
 954/*
 955 * xfs_readlink
 956 *
 957 */
 958STATIC int
 959xfs_readlink(
 960	bhv_desc_t	*bdp,
 961	uio_t		*uiop,
 962	int		ioflags,
 963	cred_t		*credp)
 964{
 965	xfs_inode_t     *ip;
 966	int		count;
 967	xfs_off_t	offset;
 968	int		pathlen;
 969	bhv_vnode_t	*vp;
 970	int		error = 0;
 971	xfs_mount_t	*mp;
 972	int             nmaps;
 973	xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 974	xfs_daddr_t	d;
 975	int		byte_cnt;
 976	int		n;
 977	xfs_buf_t	*bp;
 978
 979	vp = BHV_TO_VNODE(bdp);
 980	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 981
 982	ip = XFS_BHVTOI(bdp);
 983	mp = ip->i_mount;
 984
 985	if (XFS_FORCED_SHUTDOWN(mp))
 986		return XFS_ERROR(EIO);
 987
 988	xfs_ilock(ip, XFS_ILOCK_SHARED);
 989
 990	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
 991
 992	offset = uiop->uio_offset;
 993	count = uiop->uio_resid;
 994
 995	if (offset < 0) {
 996		error = XFS_ERROR(EINVAL);
 997		goto error_return;
 998	}
 999	if (count <= 0) {
1000		error = 0;
1001		goto error_return;
1002	}
1003
1004	/*
1005	 * See if the symlink is stored inline.
1006	 */
1007	pathlen = (int)ip->i_d.di_size;
1008
1009	if (ip->i_df.if_flags & XFS_IFINLINE) {
1010		error = xfs_uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
1011	}
1012	else {
1013		/*
1014		 * Symlink not inline.  Call bmap to get it in.
1015		 */
1016		nmaps = SYMLINK_MAPS;
1017
1018		error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
1019				  0, NULL, 0, mval, &nmaps, NULL, NULL);
1020
1021		if (error) {
1022			goto error_return;
1023		}
1024
1025		for (n = 0; n < nmaps; n++) {
1026			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1027			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1028			bp = xfs_buf_read(mp->m_ddev_targp, d,
1029				      BTOBB(byte_cnt), 0);
1030			error = XFS_BUF_GETERROR(bp);
1031			if (error) {
1032				xfs_ioerror_alert("xfs_readlink",
1033					  ip->i_mount, bp, XFS_BUF_ADDR(bp));
1034				xfs_buf_relse(bp);
1035				goto error_return;
1036			}
1037			if (pathlen < byte_cnt)
1038				byte_cnt = pathlen;
1039			pathlen -= byte_cnt;
1040
1041			error = xfs_uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1042			xfs_buf_relse (bp);
1043		}
1044
1045	}
1046
1047error_return:
1048	xfs_iunlock(ip, XFS_ILOCK_SHARED);
1049	return error;
1050}
1051
1052
1053/*
1054 * xfs_fsync
1055 *
1056 * This is called to sync the inode and its data out to disk.
1057 * We need to hold the I/O lock while flushing the data, and
1058 * the inode lock while flushing the inode.  The inode lock CANNOT
1059 * be held while flushing the data, so acquire after we're done
1060 * with that.
1061 */
1062STATIC int
1063xfs_fsync(
1064	bhv_desc_t	*bdp,
1065	int		flag,
1066	cred_t		*credp,
1067	xfs_off_t	start,
1068	xfs_off_t	stop)
1069{
1070	xfs_inode_t	*ip;
1071	xfs_trans_t	*tp;
1072	int		error;
1073	int		log_flushed = 0, changed = 1;
1074
1075	vn_trace_entry(BHV_TO_VNODE(bdp),
1076			__FUNCTION__, (inst_t *)__return_address);
1077
1078	ip = XFS_BHVTOI(bdp);
1079
1080	ASSERT(start >= 0 && stop >= -1);
1081
1082	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1083		return XFS_ERROR(EIO);
1084
1085	/*
1086	 * We always need to make sure that the required inode state
1087	 * is safe on disk.  The vnode might be clean but because
1088	 * of committed transactions that haven't hit the disk yet.
1089	 * Likewise, there could be unflushed non-transactional
1090	 * changes to the inode core that have to go to disk.
1091	 *
1092	 * The following code depends on one assumption:  that
1093	 * any transaction that changes an inode logs the core
1094	 * because it has to change some field in the inode core
1095	 * (typically nextents or nblocks).  That assumption
1096	 * implies that any transactions against an inode will
1097	 * catch any non-transactional updates.  If inode-altering
1098	 * transactions exist that violate this assumption, the
1099	 * code breaks.  Right now, it figures that if the involved
1100	 * update_* field is clear and the inode is unpinned, the
1101	 * inode is clean.  Either it's been flushed or it's been
1102	 * committed and the commit has hit the disk unpinning the inode.
1103	 * (Note that xfs_inode_item_format() called at commit clears
1104	 * the update_* fields.)
1105	 */
1106	xfs_ilock(ip, XFS_ILOCK_SHARED);
1107
1108	/* If we are flushing data then we care about update_size
1109	 * being set, otherwise we care about update_core
1110	 */
1111	if ((flag & FSYNC_DATA) ?
1112			(ip->i_update_size == 0) :
1113			(ip->i_update_core == 0)) {
1114		/*
1115		 * Timestamps/size haven't changed since last inode
1116		 * flush or inode transaction commit.  That means
1117		 * either nothing got written or a transaction
1118		 * committed which caught the updates.	If the
1119		 * latter happened and the transaction hasn't
1120		 * hit the disk yet, the inode will be still
1121		 * be pinned.  If it is, force the log.
1122		 */
1123
1124		xfs_iunlock(ip, XFS_ILOCK_SHARED);
1125
1126		if (xfs_ipincount(ip)) {
1127			_xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1128				      XFS_LOG_FORCE |
1129				      ((flag & FSYNC_WAIT)
1130				       ? XFS_LOG_SYNC : 0),
1131				      &log_flushed);
1132		} else {
1133			/*
1134			 * If the inode is not pinned and nothing
1135			 * has changed we don't need to flush the
1136			 * cache.
1137			 */
1138			changed = 0;
1139		}
1140		error = 0;
1141	} else	{
1142		/*
1143		 * Kick off a transaction to log the inode
1144		 * core to get the updates.  Make it
1145		 * sync if FSYNC_WAIT is passed in (which
1146		 * is done by everybody but specfs).  The
1147		 * sync transaction will also force the log.
1148		 */
1149		xfs_iunlock(ip, XFS_ILOCK_SHARED);
1150		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1151		if ((error = xfs_trans_reserve(tp, 0,
1152				XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1153				0, 0, 0)))  {
1154			xfs_trans_cancel(tp, 0);
1155			return error;
1156		}
1157		xfs_ilock(ip, XFS_ILOCK_EXCL);
1158
1159		/*
1160		 * Note - it's possible that we might have pushed
1161		 * ourselves out of the way during trans_reserve
1162		 * which would flush the inode.	 But there's no
1163		 * guarantee that the inode buffer has actually
1164		 * gone out yet (it's delwri).	Plus the buffer
1165		 * could be pinned anyway if it's part of an
1166		 * inode in another recent transaction.	 So we
1167		 * play it safe and fire off the transaction anyway.
1168		 */
1169		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1170		xfs_trans_ihold(tp, ip);
1171		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1172		if (flag & FSYNC_WAIT)
1173			xfs_trans_set_sync(tp);
1174		error = _xfs_trans_commit(tp, 0, &log_flushed);
1175
1176		xfs_iunlock(ip, XFS_ILOCK_EXCL);
1177	}
1178
1179	if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1180		/*
1181		 * If the log write didn't issue an ordered tag we need
1182		 * to flush the disk cache for the data device now.
1183		 */
1184		if (!log_flushed)
1185			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1186
1187		/*
1188		 * If this inode is on the RT dev we need to flush that
1189		 * cache as well.
1190		 */
1191		if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1192			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1193	}
1194
1195	return error;
1196}
1197
1198/*
1199 * This is called by xfs_inactive to free any blocks beyond eof
1200 * when the link count isn't zero and by xfs_dm_punch_hole() when
1201 * punching a hole to EOF.
1202 */
1203int
1204xfs_free_eofblocks(
1205	xfs_mount_t	*mp,
1206	xfs_inode_t	*ip,
1207	int		flags)
1208{
1209	xfs_trans_t	*tp;
1210	int		error;
1211	xfs_fileoff_t	end_fsb;
1212	xfs_fileoff_t	last_fsb;
1213	xfs_filblks_t	map_len;
1214	int		nimaps;
1215	xfs_bmbt_irec_t	imap;
1216	int		use_iolock = (flags & XFS_FREE_EOF_LOCK);
1217
1218	/*
1219	 * Figure out if there are any blocks beyond the end
1220	 * of the file.  If not, then there is nothing to do.
1221	 */
1222	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1223	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1224	map_len = last_fsb - end_fsb;
1225	if (map_len <= 0)
1226		return 0;
1227
1228	nimaps = 1;
1229	xfs_ilock(ip, XFS_ILOCK_SHARED);
1230	error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
1231			  NULL, 0, &imap, &nimaps, NULL, NULL);
1232	xfs_iunlock(ip, XFS_ILOCK_SHARED);
1233
1234	if (!error && (nimaps != 0) &&
1235	    (imap.br_startblock != HOLESTARTBLOCK ||
1236	     ip->i_delayed_blks)) {
1237		/*
1238		 * Attach the dquots to the inode up front.
1239		 */
1240		if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1241			return error;
1242
1243		/*
1244		 * There are blocks after the end of file.
1245		 * Free them up now by truncating the file to
1246		 * its current size.
1247		 */
1248		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1249
1250		/*
1251		 * Do the xfs_itruncate_start() call before
1252		 * reserving any log space because
1253		 * itruncate_start will call into the buffer
1254		 * cache and we can't
1255		 * do that within a transaction.
1256		 */
1257		if (use_iolock)
1258			xfs_ilock(ip, XFS_IOLOCK_EXCL);
1259		error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1260				    ip->i_size);
1261		if (error) {
1262			xfs_trans_cancel(tp, 0);
1263			if (use_iolock)
1264				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1265			return error;
1266		}
1267
1268		error = xfs_trans_reserve(tp, 0,
1269					  XFS_ITRUNCATE_LOG_RES(mp),
1270					  0, XFS_TRANS_PERM_LOG_RES,
1271					  XFS_ITRUNCATE_LOG_COUNT);
1272		if (error) {
1273			ASSERT(XFS_FORCED_SHUTDOWN(mp));
1274			xfs_trans_cancel(tp, 0);
1275			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1276			return error;
1277		}
1278
1279		xfs_ilock(ip, XFS_ILOCK_EXCL);
1280		xfs_trans_ijoin(tp, ip,
1281				XFS_IOLOCK_EXCL |
1282				XFS_ILOCK_EXCL);
1283		xfs_trans_ihold(tp, ip);
1284
1285		error = xfs_itruncate_finish(&tp, ip,
1286					     ip->i_size,
1287					     XFS_DATA_FORK,
1288					     0);
1289		/*
1290		 * If we get an error at this point we
1291		 * simply don't bother truncating the file.
1292		 */
1293		if (error) {
1294			xfs_trans_cancel(tp,
1295					 (XFS_TRANS_RELEASE_LOG_RES |
1296					  XFS_TRANS_ABORT));
1297		} else {
1298			error = xfs_trans_commit(tp,
1299						XFS_TRANS_RELEASE_LOG_RES);
1300		}
1301		xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1302					    : XFS_ILOCK_EXCL));
1303	}
1304	return error;
1305}
1306
1307/*
1308 * Free a symlink that has blocks associated with it.
1309 */
1310STATIC int
1311xfs_inactive_symlink_rmt(
1312	xfs_inode_t	*ip,
1313	xfs_trans_t	**tpp)
1314{
1315	xfs_buf_t	*bp;
1316	int		committed;
1317	int		done;
1318	int		error;
1319	xfs_fsblock_t	first_block;
1320	xfs_bmap_free_t	free_list;
1321	int		i;
1322	xfs_mount_t	*mp;
1323	xfs_bmbt_irec_t	mval[SYMLINK_MAPS];
1324	int		nmaps;
1325	xfs_trans_t	*ntp;
1326	int		size;
1327	xfs_trans_t	*tp;
1328
1329	tp = *tpp;
1330	mp = ip->i_mount;
1331	ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1332	/*
1333	 * We're freeing a symlink that has some
1334	 * blocks allocated to it.  Free the
1335	 * blocks here.  We know that we've got
1336	 * either 1 or 2 extents and that we can
1337	 * free them all in one bunmapi call.
1338	 */
1339	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1340	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1341			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1342		ASSERT(XFS_FORCED_SHUTDOWN(mp));
1343		xfs_trans_cancel(tp, 0);
1344		*tpp = NULL;
1345		return error;
1346	}
1347	/*
1348	 * Lock the inode, fix the size, and join it to the transaction.
1349	 * Hold it so in the normal path, we still have it locked for
1350	 * the second transaction.  In the error paths we need it
1351	 * held so the cancel won't rele it, see below.
1352	 */
1353	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1354	size = (int)ip->i_d.di_size;
1355	ip->i_d.di_size = 0;
1356	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1357	xfs_trans_ihold(tp, ip);
1358	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1359	/*
1360	 * Find the block(s) so we can inval and unmap them.
1361	 */
1362	done = 0;
1363	XFS_BMAP_INIT(&free_list, &first_block);
1364	nmaps = ARRAY_SIZE(mval);
1365	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1366			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1367			&free_list, NULL)))
1368		goto error0;
1369	/*
1370	 * Invalidate the block(s).
1371	 */
1372	for (i = 0; i < nmaps; i++) {
1373		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1374			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1375			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1376		xfs_trans_binval(tp, bp);
1377	}
1378	/*
1379	 * Unmap the dead block(s) to the free_list.
1380	 */
1381	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1382			&first_block, &free_list, NULL, &done)))
1383		goto error1;
1384	ASSERT(done);
1385	/*
1386	 * Commit the first transaction.  This logs the EFI and the inode.
1387	 */
1388	if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1389		goto error1;
1390	/*
1391	 * The transaction must have been committed, since there were
1392	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1393	 * The new tp has the extent freeing and EFDs.
1394	 */
1395	ASSERT(committed);
1396	/*
1397	 * The first xact was committed, so add the inode to the new one.
1398	 * Mark it dirty so it will be logged and moved forward in the log as
1399	 * part of every commit.
1400	 */
1401	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1402	xfs_trans_ihold(tp, ip);
1403	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1404	/*
1405	 * Get a new, empty transaction to return to our caller.
1406	 */
1407	ntp = xfs_trans_dup(tp);
1408	/*
1409	 * Commit the transaction containing extent freeing and EFDs.
1410	 * If we get an error on the commit here or on the reserve below,
1411	 * we need to unlock the inode since the new transaction doesn't
1412	 * have the inode attached.
1413	 */
1414	error = xfs_trans_commit(tp, 0);
1415	tp = ntp;
1416	if (error) {
1417		ASSERT(XFS_FORCED_SHUTDOWN(mp));
1418		goto error0;
1419	}
1420	/*
1421	 * Remove the memory for extent descriptions (just bookkeeping).
1422	 */
1423	if (ip->i_df.if_bytes)
1424		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1425	ASSERT(ip->i_df.if_bytes == 0);
1426	/*
1427	 * Put an itruncate log reservation in the new transaction
1428	 * for our caller.
1429	 */
1430	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1431			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1432		ASSERT(XFS_FORCED_SHUTDOWN(mp));
1433		goto error0;
1434	}
1435	/*
1436	 * Return with the inode locked but not joined to the transaction.
1437	 */
1438	*tpp = tp;
1439	return 0;
1440
1441 error1:
1442	xfs_bmap_cancel(&free_list);
1443 error0:
1444	/*
1445	 * Have to come here with the inode locked and either
1446	 * (held and in the transaction) or (not in the transaction).
1447	 * If the inode isn't held then cancel would iput it, but
1448	 * that's wrong since this is inactive and the vnode ref
1449	 * count is 0 already.
1450	 * Cancel won't do anything to the inode if held, but it still
1451	 * needs to be locked until the cancel is done, if it was
1452	 * joined to the transaction.
1453	 */
1454	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1455	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1456	*tpp = NULL;
1457	return error;
1458
1459}
1460
1461STATIC int
1462xfs_inactive_symlink_local(
1463	xfs_inode_t	*ip,
1464	xfs_trans_t	**tpp)
1465{
1466	int		error;
1467
1468	ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1469	/*
1470	 * We're freeing a symlink which fit into
1471	 * the inode.  Just free the memory used
1472	 * to hold the old symlink.
1473	 */
1474	error = xfs_trans_reserve(*tpp, 0,
1475				  XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1476				  0, XFS_TRANS_PERM_LOG_RES,
1477				  XFS_ITRUNCATE_LOG_COUNT);
1478
1479	if (error) {
1480		xfs_trans_cancel(*tpp, 0);
1481		*tpp = NULL;
1482		return error;
1483	}
1484	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1485
1486	/*
1487	 * Zero length symlinks _can_ exist.
1488	 */
1489	if (ip->i_df.if_bytes > 0) {
1490		xfs_idata_realloc(ip,
1491				  -(ip->i_df.if_bytes),
1492				  XFS_DATA_FORK);
1493		ASSERT(ip->i_df.if_bytes == 0);
1494	}
1495	return 0;
1496}
1497
1498STATIC int
1499xfs_inactive_attrs(
1500	xfs_inode_t	*ip,
1501	xfs_trans_t	**tpp)
1502{
1503	xfs_trans_t	*tp;
1504	int		error;
1505	xfs_mount_t	*mp;
1506
1507	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1508	tp = *tpp;
1509	mp = ip->i_mount;
1510	ASSERT(ip->i_d.di_forkoff != 0);
1511	xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1512	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1513
1514	error = xfs_attr_inactive(ip);
1515	if (error) {
1516		*tpp = NULL;
1517		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1518		return error; /* goto out */
1519	}
1520
1521	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1522	error = xfs_trans_reserve(tp, 0,
1523				  XFS_IFREE_LOG_RES(mp),
1524				  0, XFS_TRANS_PERM_LOG_RES,
1525				  XFS_INACTIVE_LOG_COUNT);
1526	if (error) {
1527		ASSERT(XFS_FORCED_SHUTDOWN(mp));
1528		xfs_trans_cancel(tp, 0);
1529		*tpp = NULL;
1530		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1531		return error;
1532	}
1533
1534	xfs_ilock(ip, XFS_ILOCK_EXCL);
1535	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1536	xfs_trans_ihold(tp, ip);
1537	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1538
1539	ASSERT(ip->i_d.di_anextents == 0);
1540
1541	*tpp = tp;
1542	return 0;
1543}
1544
1545STATIC int
1546xfs_release(
1547	bhv_desc_t	*bdp)
1548{
1549	xfs_inode_t	*ip;
1550	bhv_vnode_t	*vp;
1551	xfs_mount_t	*mp;
1552	int		error;
1553
1554	vp = BHV_TO_VNODE(bdp);
1555	ip = XFS_BHVTOI(bdp);
1556	mp = ip->i_mount;
1557
1558	if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1559		return 0;
1560
1561	/* If this is a read-only mount, don't do this (would generate I/O) */
1562	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1563		return 0;
1564
1565	if (!XFS_FORCED_SHUTDOWN(mp)) {
1566		/*
1567		 * If we are using filestreams, and we have an unlinked
1568		 * file that we are processing the last close on, then nothing
1569		 * will be able to reopen and write to this file. Purge this
1570		 * inode from the filestreams cache so that it doesn't delay
1571		 * teardown of the inode.
1572		 */
1573		if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1574			xfs_filestream_deassociate(ip);
1575
1576		/*
1577		 * If we previously truncated this file and removed old data
1578		 * in the process, we want to initiate "early" writeout on
1579		 * the last close.  This is an attempt to combat the notorious
1580		 * NULL files problem which is particularly noticable from a
1581		 * truncate down, buffered (re-)write (delalloc), followed by
1582		 * a crash.  What we are effectively doing here is
1583		 * significantly reducing the time window where we'd otherwise
1584		 * be exposed to that problem.
1585		 */
1586		if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1587			bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
1588	}
1589
1590#ifdef HAVE_REFCACHE
1591	/* If we are in the NFS reference cache then don't do this now */
1592	if (ip->i_refcache)
1593		return 0;
1594#endif
1595
1596	if (ip->i_d.di_nlink != 0) {
1597		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1598		     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1599		       ip->i_delayed_blks > 0)) &&
1600		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1601		    (!(ip->i_d.di_flags &
1602				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1603			error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1604			if (error)
1605				return error;
1606			/* Update linux inode block count after free above */
1607			vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1608				ip->i_d.di_nblocks + ip->i_delayed_blks);
1609		}
1610	}
1611
1612	return 0;
1613}
1614
1615/*
1616 * xfs_inactive
1617 *
1618 * This is called when the vnode reference count for the vnode
1619 * goes to zero.  If the file has been unlinked, then it must
1620 * now be truncated.  Also, we clear all of the read-ahead state
1621 * kept for the inode here since the file is now closed.
1622 */
1623STATIC int
1624xfs_inactive(
1625	bhv_desc_t	*bdp,
1626	cred_t		*credp)
1627{
1628	xfs_inode_t	*ip;
1629	bhv_vnode_t	*vp;
1630	xfs_bmap_free_t	free_list;
1631	xfs_fsblock_t	first_block;
1632	int		committed;
1633	xfs_trans_t	*tp;
1634	xfs_mount_t	*mp;
1635	int		error;
1636	int		truncate;
1637
1638	vp = BHV_TO_VNODE(bdp);
1639	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1640
1641	ip = XFS_BHVTOI(bdp);
1642
1643	/*
1644	 * If the inode is already free, then there can be nothing
1645	 * to clean up here.
1646	 */
1647	if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1648		ASSERT(ip->i_df.if_real_bytes == 0);
1649		ASSERT(ip->i_df.if_broot_bytes == 0);
1650		return VN_INACTIVE_CACHE;
1651	}
1652
1653	/*
1654	 * Only do a truncate if it's a regular file with
1655	 * some actual space in it.  It's OK to look at the
1656	 * inode's fields without the lock because we're the
1657	 * only one with a reference to the inode.
1658	 */
1659	truncate = ((ip->i_d.di_nlink == 0) &&
1660	    ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1661	     (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1662	    ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1663
1664	mp = ip->i_mount;
1665
1666	if (ip->i_d.di_nlink == 0 &&
1667	    DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1668		(void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1669	}
1670
1671	error = 0;
1672
1673	/* If this is a read-only mount, don't do this (would generate I/O) */
1674	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1675		goto out;
1676
1677	if (ip->i_d.di_nlink != 0) {
1678		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1679                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1680                       ip->i_delayed_blks > 0)) &&
1681		      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1682		     (!(ip->i_d.di_flags &
1683				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1684		      (ip->i_delayed_blks != 0)))) {
1685			error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1686			if (error)
1687				return VN_INACTIVE_CACHE;
1688			/* Update linux inode block count after free above */
1689			vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1690				ip->i_d.di_nblocks + ip->i_delayed_blks);
1691		}
1692		goto out;
1693	}
1694
1695	ASSERT(ip->i_d.di_nlink == 0);
1696
1697	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1698		return VN_INACTIVE_CACHE;
1699
1700	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1701	if (truncate) {
1702		/*
1703		 * Do the xfs_itruncate_start() call before
1704		 * reserving any log space because itruncate_start
1705		 * will call into the buffer cache and we can't
1706		 * do that within a transaction.
1707		 */
1708		xfs_ilock(ip, XFS_IOLOCK_EXCL);
1709
1710		error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1711		if (error) {
1712			xfs_trans_cancel(tp, 0);
1713			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1714			return VN_INACTIVE_CACHE;
1715		}
1716
1717		error = xfs_trans_reserve(tp, 0,
1718					  XFS_ITRUNCATE_LOG_RES(mp),
1719					  0, XFS_TRANS_PERM_LOG_RES,
1720					  XFS_ITRUNCATE_LOG_COUNT);
1721		if (error) {
1722			/* Don't call itruncate_cleanup */
1723			ASSERT(XFS_FORCED_SHUTDOWN(mp));
1724			xfs_trans_cancel(tp, 0);
1725			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1726			return VN_INACTIVE_CACHE;
1727		}
1728
1729		xfs_ilock(ip, XFS_ILOCK_EXCL);
1730		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1731		xfs_trans_ihold(tp, ip);
1732
1733		/*
1734		 * normally, we have to run xfs_itruncate_finish sync.
1735		 * But if filesystem is wsync and we're in the inactive
1736		 * path, then we know that nlink == 0, and that the
1737		 * xaction that made nlink == 0 is permanently committed
1738		 * since xfs_remove runs as a synchronous transaction.
1739		 */
1740		error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1741				(!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1742
1743		if (error) {
1744			xfs_trans_cancel(tp,
1745				XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1746			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1747			return VN_INACTIVE_CACHE;
1748		}
1749	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1750
1751		/*
1752		 * If we get an error while cleaning up a
1753		 * symlink we bail out.
1754		 */
1755		error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1756			xfs_inactive_symlink_rmt(ip, &tp) :
1757			xfs_inactive_symlink_local(ip, &tp);
1758
1759		if (error) {
1760			ASSERT(tp == NULL);
1761			return VN_INACTIVE_CACHE;
1762		}
1763
1764		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1765		xfs_trans_ihold(tp, ip);
1766	} else {
1767		error = xfs_trans_reserve(tp, 0,
1768					  XFS_IFREE_LOG_RES(mp),
1769					  0, XFS_TRANS_PERM_LOG_RES,
1770					  XFS_INACTIVE_LOG_COUNT);
1771		if (error) {
1772			ASSERT(XFS_FORCED_SHUTDOWN(mp));
1773			xfs_trans_cancel(tp, 0);
1774			return VN_INACTIVE_CACHE;
1775		}
1776
1777		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1778		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1779		xfs_trans_ihold(tp, ip);
1780	}
1781
1782	/*
1783	 * If there are attributes associated with the file
1784	 * then blow them away now.  The code calls a routine
1785	 * that recursively deconstructs the attribute fork.
1786	 * We need to just commit the current transaction
1787	 * because we can't use it for xfs_attr_inactive().
1788	 */
1789	if (ip->i_d.di_anextents > 0) {
1790		error = xfs_inactive_attrs(ip, &tp);
1791		/*
1792		 * If we got an error, the transaction is already
1793		 * cancelled, and the inode is unlocked. Just get out.
1794		 */
1795		 if (error)
1796			 return VN_INACTIVE_CACHE;
1797	} else if (ip->i_afp) {
1798		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1799	}
1800
1801	/*
1802	 * Free the inode.
1803	 */
1804	XFS_BMAP_INIT(&free_list, &first_block);
1805	error = xfs_ifree(tp, ip, &free_list);
1806	if (error) {
1807		/*
1808		 * If we fail to free the inode, shut down.  The cancel
1809		 * might do that, we need to make sure.  Otherwise the
1810		 * inode might be lost for a long time or forever.
1811		 */
1812		if (!XFS_FORCED_SHUTDOWN(mp)) {
1813			cmn_err(CE_NOTE,
1814		"xfs_inactive:	xfs_ifree() returned an error = %d on %s",
1815				error, mp->m_fsname);
1816			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1817		}
1818		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1819	} else {
1820		/*
1821		 * Credit the quota account(s). The inode is gone.
1822		 */
1823		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1824
1825		/*
1826		 * Just ignore errors at this point.  There is
1827		 * nothing we can do except to try to keep going.
1828		 */
1829		(void) xfs_bmap_finish(&tp,  &free_list, &committed);
1830		(void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1831	}
1832	/*
1833	 * Release the dquots held by inode, if any.
1834	 */
1835	XFS_QM_DQDETACH(mp, ip);
1836
1837	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1838
1839 out:
1840	return VN_INACTIVE_CACHE;
1841}
1842
1843
1844/*
1845 * xfs_lookup
1846 */
1847STATIC int
1848xfs_lookup(
1849	bhv_desc_t		*dir_bdp,
1850	bhv_vname_t		*dentry,
1851	bhv_vnode_t		**vpp,
1852	int			flags,
1853	bhv_vnode_t		*rdir,
1854	cred_t			*credp)
1855{
1856	xfs_inode_t		*dp, *ip;
1857	xfs_ino_t		e_inum;
1858	int			error;
1859	uint			lock_mode;
1860	bhv_vnode_t		*dir_vp;
1861
1862	dir_vp = BHV_TO_VNODE(dir_bdp);
1863	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1864
1865	dp = XFS_BHVTOI(dir_bdp);
1866
1867	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1868		return XFS_ERROR(EIO);
1869
1870	lock_mode = xfs_ilock_map_shared(dp);
1871	error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1872	if (!error) {
1873		*vpp = XFS_ITOV(ip);
1874		ITRACE(ip);
1875	}
1876	xfs_iunlock_map_shared(dp, lock_mode);
1877	return error;
1878}
1879
1880
1881/*
1882 * xfs_create (create a new file).
1883 */
1884STATIC int
1885xfs_create(
1886	bhv_desc_t		*dir_bdp,
1887	bhv_vname_t		*dentry,
1888	bhv_vattr_t		*vap,
1889	bhv_vnode_t		**vpp,
1890	cred_t			*credp)
1891{
1892	char			*name = VNAME(dentry);
1893	bhv_vnode_t		*dir_vp;
1894	xfs_inode_t		*dp, *ip;
1895	bhv_vnode_t	        *vp = NULL;
1896	xfs_trans_t		*tp;
1897	xfs_mount_t	        *mp;
1898	xfs_dev_t		rdev;
1899	int                     error;
1900	xfs_bmap_free_t		free_list;
1901	xfs_fsblock_t		first_block;
1902	boolean_t		dp_joined_to_trans;
1903	int			dm_event_sent = 0;
1904	uint			cancel_flags;
1905	int			committed;
1906	xfs_prid_t		prid;
1907	struct xfs_dquot	*udqp, *gdqp;
1908	uint			resblks;
1909	int			dm_di_mode;
1910	int			namelen;
1911
1912	ASSERT(!*vpp);
1913	dir_vp = BHV_TO_VNODE(dir_bdp);
1914	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1915
1916	dp = XFS_BHVTOI(dir_bdp);
1917	mp = dp->i_mount;
1918
1919	dm_di_mode = vap->va_mode;
1920	namelen = VNAMELEN(dentry);
1921
1922	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1923		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1924				dir_vp, DM_RIGHT_NULL, NULL,
1925				DM_RIGHT_NULL, name, NULL,
1926				dm_di_mode, 0, 0);
1927
1928		if (error)
1929			return error;
1930		dm_event_sent = 1;
1931	}
1932
1933	if (XFS_FORCED_SHUTDOWN(mp))
1934		return XFS_ERROR(EIO);
1935
1936	/* Return through std_return after this point. */
1937
1938	udqp = gdqp = NULL;
1939	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1940		prid = dp->i_d.di_projid;
1941	else if (vap->va_mask & XFS_AT_PROJID)
1942		prid = (xfs_prid_t)vap->va_projid;
1943	else
1944		prid = (xfs_prid_t)dfltprid;
1945
1946	/*
1947	 * Make sure that we have allocated dquot(s) on disk.
1948	 */
1949	error = XFS_QM_DQVOPALLOC(mp, dp,
1950			current_fsuid(credp), current_fsgid(credp), prid,
1951			XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1952	if (error)
1953		goto std_return;
1954
1955	ip = NULL;
1956	dp_joined_to_trans = B_FALSE;
1957
1958	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1959	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1960	resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1961	/*
1962	 * Initially assume that the file does not exist and
1963	 * reserve the resources for that case.  If that is not
1964	 * the case we'll drop the one we have and get a more
1965	 * appropriate transaction later.
1966	 */
1967	error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1968			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1969	if (error == ENOSPC) {
1970		resblks = 0;
1971		error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1972				XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1973	}
1974	if (error) {
1975		cancel_flags = 0;
1976		dp = NULL;
1977		goto error_return;
1978	}
1979
1980	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1981
1982	XFS_BMAP_INIT(&free_list, &first_block);
1983
1984	ASSERT(ip == NULL);
1985
1986	/*
1987	 * Reserve disk quota and the inode.
1988	 */
1989	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1990	if (error)
1991		goto error_return;
1992
1993	if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
1994		goto error_return;
1995	rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1996	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
1997			rdev, credp, prid, resblks > 0,
1998			&ip, &committed);
1999	if (error) {
2000		if (error == ENOSPC)
2001			goto error_return;
2002		goto abort_return;
2003	}
2004	ITRACE(ip);
2005
2006	/*
2007	 * At this point, we've gotten a newly allocated inode.
2008	 * It is locked (and joined to the transaction).
2009	 */
2010
2011	ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
2012
2013	/*
2014	 * Now we join the directory inode to the transaction.
2015	 * We do not do it earlier because xfs_dir_ialloc
2016	 * might commit the previous transaction (and release
2017	 * all the locks).
2018	 */
2019
2020	VN_HOLD(dir_vp);
2021	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2022	dp_joined_to_trans = B_TRUE;
2023
2024	error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
2025					&first_block, &free_list, resblks ?
2026					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2027	if (error) {
2028		ASSERT(error != ENOSPC);
2029		goto abort_return;
2030	}
2031	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2032	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2033
2034	/*
2035	 * If this is a synchronous mount, make sure that the
2036	 * create transaction goes to disk before returning to
2037	 * the user.
2038	 */
2039	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2040		xfs_trans_set_sync(tp);
2041	}
2042
2043	dp->i_gen++;
2044
2045	/*
2046	 * Attach the dquot(s) to the inodes and modify them incore.
2047	 * These ids of the inode couldn't have changed since the new
2048	 * inode has been locked ever since it was created.
2049	 */
2050	XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2051
2052	/*
2053	 * xfs_trans_commit normally decrements the vnode ref count
2054	 * when it unlocks the inode. Since we want to return the
2055	 * vnode to the caller, we bump the vnode ref count now.
2056	 */
2057	IHOLD(ip);
2058	vp = XFS_ITOV(ip);
2059
2060	error = xfs_bmap_finish(&tp, &free_list, &committed);
2061	if (error) {
2062		xfs_bmap_cancel(&free_list);
2063		goto abort_rele;
2064	}
2065
2066	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2067	if (error) {
2068		IRELE(ip);
2069		tp = NULL;
2070		goto error_return;
2071	}
2072
2073	XFS_QM_DQRELE(mp, udqp);
2074	XFS_QM_DQRELE(mp, gdqp);
2075
2076	/*
2077	 * Propagate the fact that the vnode changed after the
2078	 * xfs_inode locks have been released.
2079	 */
2080	bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2081
2082	*vpp = vp;
2083
2084	/* Fallthrough to std_return with error = 0  */
2085
2086std_return:
2087	if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2088			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2089							DM_EVENT_POSTCREATE)) {
2090		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2091			dir_vp, DM_RIGHT_NULL,
2092			*vpp ? vp:NULL,
2093			DM_RIGHT_NULL, name, NULL,
2094			dm_di_mode, error, 0);
2095	}
2096	return error;
2097
2098 abort_return:
2099	cancel_flags |= XFS_TRANS_ABORT;
2100	/* FALLTHROUGH */
2101
2102 error_return:
2103	if (tp != NULL)
2104		xfs_trans_cancel(tp, cancel_flags);
2105
2106	if (!dp_joined_to_trans && (dp != NULL))
2107		xfs_iunlock(dp, XFS_ILOCK_EXCL);
2108	XFS_QM_DQRELE(mp, udqp);
2109	XFS_QM_DQRELE(mp, gdqp);
2110
2111	goto std_return;
2112
2113 abort_rele:
2114	/*
2115	 * Wait until after the current transaction is aborted to
2116	 * release the inode.  This prevents recursive transactions
2117	 * and deadlocks from xfs_inactive.
2118	 */
2119	cancel_flags |= XFS_TRANS_ABORT;
2120	xfs_trans_cancel(tp, cancel_flags);
2121	IRELE(ip);
2122
2123	XFS_QM_DQRELE(mp, udqp);
2124	XFS_QM_DQRELE(mp, gdqp);
2125
2126	goto std_return;
2127}
2128
2129#ifdef DEBUG
2130/*
2131 * Some counters to see if (and how often) we are hitting some deadlock
2132 * prevention code paths.
2133 */
2134
2135int xfs_rm_locks;
2136int xfs_rm_lock_delays;
2137int xfs_rm_attempts;
2138#endif
2139
2140/*
2141 * The following routine will lock the inodes associated with the
2142 * directory and the named entry in the directory. The locks are
2143 * acquired in increasing inode number.
2144 *
2145 * If the entry is "..", then only the directory is locked. The
2146 * vnode ref count will still include that from the .. entry in
2147 * this case.
2148 *
2149 * There is a deadlock we need to worry about. If the locked directory is
2150 * in the AIL, it might be blocking up the log. The next inode we lock
2151 * could be already locked by another thread waiting for log space (e.g
2152 * a permanent log reservation with a long running transaction (see
2153 * xfs_itruncate_finish)). To solve this, we must check if the directory
2154 * is in the ail and use lock_nowait. If we can't lock, we need to
2155 * drop the inode lock on the directory and try again. xfs_iunlock will
2156 * potentially push the tail if we were holding up the log.
2157 */
2158STATIC int
2159xfs_lock_dir_and_entry(
2160	xfs_inode_t	*dp,
2161	xfs_inode_t	*ip)	/* inode of entry 'name' */
2162{
2163	int		attempts;
2164	xfs_ino_t	e_inum;
2165	xfs_inode_t	*ips[2];
2166	xfs_log_item_t	*lp;
2167
2168#ifdef DEBUG
2169	xfs_rm_locks++;
2170#endif
2171	attempts = 0;
2172
2173again:
2174	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2175
2176	e_inum = ip->i_ino;
2177
2178	ITRACE(ip);
2179
2180	/*
2181	 * We want to lock in increasing inum. Since we've already
2182	 * acquired the lock on the directory, we may need to release
2183	 * if if the inum of the entry turns out to be less.
2184	 */
2185	if (e_inum > dp->i_ino) {
2186		/*
2187		 * We are already in the right order, so just
2188		 * lock on the inode of the entry.
2189		 * We need to use nowait if dp is in the AIL.
2190		 */
2191
2192		lp = (xfs_log_item_t *)dp->i_itemp;
2193		if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2194			if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2195				attempts++;
2196#ifdef DEBUG
2197				xfs_rm_attempts++;
2198#endif
2199
2200				/*
2201				 * Unlock dp and try again.
2202				 * xfs_iunlock will try to push the tail
2203				 * if the inode is in the AIL.
2204				 */
2205
2206				xfs_iunlock(dp, XFS_ILOCK_EXCL);
2207
2208				if ((attempts % 5) == 0) {
2209					delay(1); /* Don't just spin the CPU */
2210#ifdef DEBUG
2211					xfs_rm_lock_delays++;
2212#endif
2213				}
2214				goto again;
2215			}
2216		} else {
2217			xfs_ilock(ip, XFS_ILOCK_EXCL);
2218		}
2219	} else if (e_inum < dp->i_ino) {
2220		xfs_iunlock(dp, XFS_ILOCK_EXCL);
2221
2222		ips[0] = ip;
2223		ips[1] = dp;
2224		xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2225	}
2226	/* else	 e_inum == dp->i_ino */
2227	/*     This can happen if we're asked to lock /x/..
2228	 *     the entry is "..", which is also the parent directory.
2229	 */
2230
2231	return 0;
2232}
2233
2234#ifdef DEBUG
2235int xfs_locked_n;
2236int xfs_small_retries;
2237int xfs_middle_retries;
2238int xfs_lots_retries;
2239int xfs_lock_delays;
2240#endif
2241
2242/*
2243 * Bump the subclass so xfs_lock_inodes() acquires each lock with
2244 * a different value
2245 */
2246static inline int
2247xfs_lock_inumorder(int lock_mode, int subclass)
2248{
2249	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2250		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2251	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2252		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2253
2254	return lock_mode;
2255}
2256
2257/*
2258 * The following routine will lock n inodes in exclusive mode.
2259 * We assume the caller calls us with the inodes in i_ino order.
2260 *
2261 * We need to detect deadlock where an inode that we lock
2262 * is in the AIL and we start waiting for another inode that is locked
2263 * by a thread in a long running transaction (such as truncate). This can
2264 * result in deadlock since the long running trans might need to wait
2265 * for the inode we just locked in order to push the tail and free space
2266 * in the log.
2267 */
2268void
2269xfs_lock_inodes(
2270	xfs_inode_t	**ips,
2271	int		inodes,
2272	int		first_locked,
2273	uint		lock_mode)
2274{
2275	int		attempts = 0, i, j, try_lock;
2276	xfs_log_item_t	*lp;
2277
2278	ASSERT(ips && (inodes >= 2)); /* we need at least two */
2279
2280	if (first_locked) {
2281		try_lock = 1;
2282		i = 1;
2283	} else {
2284		try_lock = 0;
2285		i = 0;
2286	}
2287
2288again:
2289	for (; i < inodes; i++) {
2290		ASSERT(ips[i]);
2291
2292		if (i && (ips[i] == ips[i-1]))	/* Already locked */
2293			continue;
2294
2295		/*
2296		 * If try_lock is not set yet, make sure all locked inodes
2297		 * are not in the AIL.
2298		 * If any are, set try_lock to be used later.
2299		 */
2300
2301		if (!try_lock) {
2302			for (j = (i - 1); j >= 0 && !try_lock; j--) {
2303				lp = (xfs_log_item_t *)ips[j]->i_itemp;
2304				if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2305					try_lock++;
2306				}
2307			}
2308		}
2309
2310		/*
2311		 * If any of the previous locks we have locked is in the AIL,
2312		 * we must TRY to get the second and subsequent locks. If
2313		 * we can't get any, we must release all we have
2314		 * and try again.
2315		 */
2316
2317		if (try_lock) {
2318			/* try_lock must be 0 if i is 0. */
2319			/*
2320			 * try_lock means we have an inode locked
2321			 * that is in the AIL.
2322			 */
2323			ASSERT(i != 0);
2324			if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2325				attempts++;
2326
2327				/*
2328				 * Unlock all previous guys and try again.
2329				 * xfs_iunlock will try to push the tail
2330				 * if the inode is in the AIL.
2331				 */
2332
2333				for(j = i - 1; j >= 0; j--) {
2334
2335					/*
2336					 * Check to see if we've already
2337					 * unlocked this one.
2338					 * Not the first one going back,
2339					 * and the inode ptr is the same.
2340					 */
2341					if ((j != (i - 1)) && ips[j] ==
2342								ips[j+1])
2343						continue;
2344
2345					xfs_iunlock(ips[j], lock_mode);
2346				}
2347
2348				if ((attempts % 5) == 0) {
2349					delay(1); /* Don't just spin the CPU */
2350#ifdef DEBUG
2351					xfs_lock_delays++;
2352#endif
2353				}
2354				i = 0;
2355				try_lock = 0;
2356				goto again;
2357			}
2358		} else {
2359			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2360		}
2361	}
2362
2363#ifdef DEBUG
2364	if (attempts) {
2365		if (attempts < 5) xfs_small_retries++;
2366		else if (attempts < 100) xfs_middle_retries++;
2367		else xfs_lots_retries++;
2368	} else {
2369		xfs_locked_n++;
2370	}
2371#endif
2372}
2373
/*
 * REMOVE_DEBUG_TRACE(x): on DEBUG builds, remember x (callers pass
 * __LINE__) in remove_which_error_return so the most recent error exit
 * can be identified; on non-DEBUG builds it does nothing.
 *
 * Both variants expand to a single do { } while (0) statement so the
 * macro is safe in unbraced if/else bodies.
 */
#ifdef	DEBUG
int remove_which_error_return = 0;
#define	REMOVE_DEBUG_TRACE(x)	\
	do { remove_which_error_return = (x); } while (0)
#else /* ! DEBUG */
#define	REMOVE_DEBUG_TRACE(x)	do { } while (0)
#endif	/* ! DEBUG */
2380
2381
2382/*
2383 * xfs_remove
2384 *
2385 */
2386STATIC int
2387xfs_remove(
2388	bhv_desc_t		*dir_bdp,
2389	bhv_vname_t		*dentry,
2390	cred_t			*credp)
2391{
2392	bhv_vnode_t		*dir_vp;
2393	char			*name = VNAME(dentry);
2394	xfs_inode_t             *dp, *ip;
2395	xfs_trans_t             *tp = NULL;
2396	xfs_mount_t		*mp;
2397	int                     error = 0;
2398	xfs_bmap_free_t         free_list;
2399	xfs_fsblock_t           first_block;
2400	int			cancel_flags;
2401	int			committed;
2402	int			dm_di_mode = 0;
2403	int			link_zero;
2404	uint			resblks;
2405	int			namelen;
2406
2407	dir_vp = BHV_TO_VNODE(dir_bdp);
2408	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2409
2410	dp = XFS_BHVTOI(dir_bdp);
2411	mp = dp->i_mount;
2412
2413	if (XFS_FORCED_SHUTDOWN(mp))
2414		return XFS_ERROR(EIO);
2415
2416	namelen = VNAMELEN(dentry);
2417
2418	if (!xfs_get_dir_entry(dentry, &ip)) {
2419	        dm_di_mode = ip->i_d.di_mode;
2420		IRELE(ip);
2421	}
2422
2423	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2424		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2425					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2426					name, NULL, dm_di_mode, 0, 0);
2427		if (error)
2428			return error;
2429	}
2430
2431	/* From this point on, return through std_return */
2432	ip = NULL;
2433
2434	/*
2435	 * We need to get a reference to ip before we get our log
2436	 * reservation. The reason for this is that we cannot call
2437	 * xfs_iget for an inode for which we do not have a reference
2438	 * once we've acquired a log reservation. This is because the
2439	 * inode we are trying to get might be in xfs_inactive going
2440	 * for a log reservation. Since we'll have to wait for the
2441	 * inactive code to complete before returning from xfs_iget,
2442	 * we need to make sure that we don't have log space reserved
2443	 * when we call xfs_iget.  Instead we get an unlocked reference
2444	 * to the inode before getting our log reservation.
2445	 */
2446	error = xfs_get_dir_entry(dentry, &ip);
2447	if (error) {
2448		REMOVE_DEBUG_TRACE(__LINE__);
2449		goto std_return;
2450	}
2451
2452	dm_di_mode = ip->i_d.di_mode;
2453
2454	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2455
2456	ITRACE(ip);
2457
2458	error = XFS_QM_DQATTACH(mp, dp, 0);
2459	if (!error && dp != ip)
2460		error = XFS_QM_DQATTACH(mp, ip, 0);
2461	if (error) {
2462		REMOVE_DEBUG_TRACE(__LINE__);
2463		IRELE(ip);
2464		goto std_return;
2465	}
2466
2467	tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2468	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2469	/*
2470	 * We try to get the real space reservation first,
2471	 * allowing for directory btree deletion(s) implying
2472	 * possible bmap insert(s).  If we can't get the space
2473	 * reservation then we use 0 instead, and avoid the bmap
2474	 * btree insert(s) in the directory code by, if the bmap
2475	 * insert tries to happen, instead trimming the LAST
2476	 * block from the directory.
2477	 */
2478	resblks = XFS_REMOVE_SPACE_RES(mp);
2479	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2480			XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2481	if (error == ENOSPC) {
2482		resblks = 0;
2483		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2484				XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2485	}
2486	if (error) {
2487		ASSERT(error != ENOSPC);
2488		REMOVE_DEBUG_TRACE(__LINE__);
2489		xfs_trans_cancel(tp, 0);
2490		IRELE(ip);
2491		return error;
2492	}
2493
2494	error = xfs_lock_dir_and_entry(dp, ip);
2495	if (error) {
2496		REMOVE_DEBUG_TRACE(__LINE__);
2497		xfs_trans_cancel(tp, cancel_flags);
2498		IRELE(ip);
2499		goto std_return;
2500	}
2501
2502	/*
2503	 * At this point, we've gotten both the directory and the entry
2504	 * inodes locked.
2505	 */
2506	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2507	if (dp != ip) {
2508		/*
2509		 * Increment vnode ref count only in this case since
2510		 * there's an extra vnode reference in the case where
2511		 * dp == ip.
2512		 */
2513		IHOLD(dp);
2514		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2515	}
2516
2517	/*
2518	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2519	 */
2520	XFS_BMAP_INIT(&free_list, &first_block);
2521	error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2522					&first_block, &free_list, 0);
2523	if (error) {
2524		ASSERT(error != ENOENT);
2525		REMOVE_DEBUG_TRACE(__LINE__);
2526		goto error1;
2527	}
2528	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2529
2530	dp->i_gen++;
2531	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2532
2533	error = xfs_droplink(tp, ip);
2534	if (error) {
2535		REMOVE_DEBUG_TRACE(__LINE__);
2536		goto error1;
2537	}
2538
2539	/* Determine if this is the last link while
2540	 * we are in the transaction.
2541	 */
2542	link_zero = (ip)->i_d.di_nlink==0;
2543
2544	/*
2545	 * Take an extra ref on the inode so that it doesn't
2546	 * go to xfs_inactive() from within the commit.
2547	 */
2548	IHOLD(ip);
2549
2550	/*
2551	 * If this is a synchronous mount, make sure that the
2552	 * remove transaction goes to disk before returning to
2553	 * the user.
2554	 */
2555	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2556		xfs_trans_set_sync(tp);
2557	}
2558
2559	error = xfs_bmap_finish(&tp, &free_list, &committed);
2560	if (error) {
2561		REMOVE_DEBUG_TRACE(__LINE__);
2562		goto error_rele;
2563	}
2564
2565	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2566	if (error) {
2567		IRELE(ip);
2568		goto std_return;
2569	}
2570
2571	/*
2572	 * Before we drop our extra reference to the inode, purge it
2573	 * from the refcache if it is there.  By waiting until afterwards
2574	 * to do the IRELE, we ensure that we won't go inactive in the
2575	 * xfs_refcache_purge_ip routine (although that would be OK).
2576	 */
2577	xfs_refcache_purge_ip(ip);
2578
2579	/*
2580	 * If we are using filestreams, kill the stream association.
2581	 * If the file is still open it may get a new one but that
2582	 * will get killed on last close in xfs_close() so we don't
2583	 * have to worry about that.
2584	 */
2585	if (link_zero && xfs_inode_is_filestream(ip))
2586		xfs_filestream_deassociate(ip);
2587
2588	vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2589
2590	/*
2591	 * Let interposed file systems know about removed links.
2592	 */
2593	bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
2594
2595	IRELE(ip);
2596
2597/*	Fall through to std_return with error = 0 */
2598 std_return:
2599	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2600						DM_EVENT_POSTREMOVE)) {
2601		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2602				dir_vp, DM_RIGHT_NULL,
2603				NULL, DM_RIGHT_NULL,
2604				name, NULL, dm_di_mode, error, 0);
2605	}
2606	return error;
2607
2608 error1:
2609	xfs_bmap_cancel(&free_list);
2610	cancel_flags |= XFS_TRANS_ABORT;
2611	xfs_trans_cancel(tp, cancel_flags);
2612	goto std_return;
2613
2614 error_rele:
2615	/*
2616	 * In this case make sure to not release the inode until after
2617	 * the current transaction is aborted.  Releasing it beforehand
2618	 * can cause us to go to xfs_inactive and start a recursive
2619	 * transaction which can easily deadlock with the current one.
2620	 */
2621	xfs_bmap_cancel(&free_list);
2622	cancel_flags |= XFS_TRANS_ABORT;
2623	xfs_trans_cancel(tp, cancel_flags);
2624
2625	/*
2626	 * Before we drop our extra reference to the inode, purge it
2627	 * from the refcache if it is there.  By waiting until afterwards
2628	 * to do the IRELE, we ensure that we won't go inactive in the
2629	 * xfs_refcache_purge_ip routine (although that would be OK).
2630	 */
2631	xfs_refcache_purge_ip(ip);
2632
2633	IRELE(ip);
2634
2635	goto std_return;
2636}
2637
2638
2639/*
2640 * xfs_link
2641 *
2642 */
2643STATIC int
2644xfs_link(
2645	bhv_desc_t		*target_dir_bdp,
2646	bhv_vnode_t		*src_vp,
2647	bhv_vname_t		*dentry,
2648	cred_t			*credp)
2649{
2650	xfs_inode_t		*tdp, *sip;
2651	xfs_trans_t		*tp;
2652	xfs_mount_t		*mp;
2653	xfs_inode_t		*ips[2];
2654	int			error;
2655	xfs_bmap_free_t         free_list;
2656	xfs_fsblock_t           first_block;
2657	int			cancel_flags;
2658	int			committed;
2659	bhv_vnode_t		*target_dir_vp;
2660	int			resblks;
2661	char			*target_name = VNAME(dentry);
2662	int			target_namelen;
2663
2664	target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2665	vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2666	vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2667
2668	target_namelen = VNAMELEN(dentry);
2669	ASSERT(!VN_ISDIR(src_vp));
2670
2671	sip = xfs_vtoi(src_vp);
2672	tdp = XFS_BHVTOI(target_dir_bdp);
2673	mp = tdp->i_mount;
2674	if (XFS_FORCED_SHUTDOWN(mp))
2675		return XFS_ERROR(EIO);
2676
2677	if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2678		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2679					target_dir_vp, DM_RIGHT_NULL,
2680					src_vp, DM_RIGHT_NULL,
2681					target_name, NULL, 0, 0, 0);
2682		if (error)
2683			return error;
2684	}
2685
2686	/* Return through std_return after this point. */
2687
2688	error = XFS_QM_DQATTACH(mp, sip, 0);
2689	if (!error && sip != tdp)
2690		error = XFS_QM_DQATTACH(mp, tdp, 0);
2691	if (error)
2692		goto std_return;
2693
2694	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2695	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2696	resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2697	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2698			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2699	if (error == ENOSPC) {
2700		resblks = 0;
2701		error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2702				XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2703	}
2704	if (error) {
2705		cancel_flags = 0;
2706		goto error_return;
2707	}
2708
2709	if (sip->i_ino < tdp->i_ino) {
2710		ips[0] = sip;
2711		ips[1] = tdp;
2712	} else {
2713		ips[0] = tdp;
2714		ips[1] = sip;
2715	}
2716
2717	xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2718
2719	/*
2720	 * Increment vnode ref counts since xfs_trans_commit &
2721	 * xfs_trans_cancel will both unlock the inodes and
2722	 * decrement the associated ref counts.
2723	 */
2724	VN_HOLD(src_vp);
2725	VN_HOLD(target_dir_vp);
2726	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2727	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2728
2729	/*
2730	 * If the source has too many links, we can't make any more to it.
2731	 */
2732	if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2733		error = XFS_ERROR(EMLINK);
2734		goto error_return;
2735	}
2736
2737	/*
2738	 * If we are using project inheritance, we only allow hard link
2739	 * creation in our tree when the project IDs are the same; else
2740	 * the tree quota mechanism could be circumvented.
2741	 */
2742	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2743		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2744		error = XFS_ERROR(EXDEV);
2745		goto error_return;
2746	}
2747
2748	if (resblks == 0 &&
2749	    (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2750		goto error_return;
2751
2752	XFS_BMAP_INIT(&free_list, &first_block);
2753
2754	error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2755				   sip->i_ino, &first_block, &free_list,
2756				   resblks);
2757	if (error)
2758		goto abort_return;
2759	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2760	tdp->i_gen++;
2761	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2762
2763	error = xfs_bumplink(tp, sip);
2764	if (error)
2765		goto abort_return;
2766
2767	/*
2768	 * If this is a synchronous mount, make sure that the
2769	 * link transaction goes to disk before returning to
2770	 * the user.
2771	 */
2772	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2773		xfs_trans_set_sync(tp);
2774	}
2775
2776	error = xfs_bmap_finish (&tp, &free_list, &committed);
2777	if (error) {
2778		xfs_bmap_cancel(&free_list);
2779		goto abort_return;
2780	}
2781
2782	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2783	if (error)
2784		goto std_return;
2785
2786	/* Fall through to std_return with error = 0. */
2787std_return:
2788	if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2789						DM_EVENT_POSTLINK)) {
2790		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2791				target_dir_vp, DM_RIGHT_NULL,
2792				src_vp, DM_RIGHT_NULL,
2793				target_name, NULL, 0, error, 0);
2794	}
2795	return error;
2796
2797 abort_return:
2798	cancel_flags |= XFS_TRANS_ABORT;
2799	/* FALLTHROUGH */
2800
2801 error_return:
2802	xfs_trans_cancel(tp, cancel_flags);
2803	goto std_return;
2804}
2805
2806
2807/*
2808 * xfs_mkdir
2809 *
2810 */
2811STATIC int
2812xfs_mkdir(
2813	bhv_desc_t		*dir_bdp,
2814	bhv_vname_t		*dentry,
2815	bhv_vattr_t		*vap,
2816	bhv_vnode_t		**vpp,
2817	cred_t			*credp)
2818{
2819	char			*dir_name = VNAME(dentry);
2820	xfs_inode_t             *dp;
2821	xfs_inode_t		*cdp;	/* inode of created dir */
2822	bhv_vnode_t		*cvp;	/* vnode of created dir */
2823	xfs_trans_t		*tp;
2824	xfs_mount_t		*mp;
2825	int			cancel_flags;
2826	int			error;
2827	int			committed;
2828	xfs_bmap_free_t         free_list;
2829	xfs_fsblock_t           first_block;
2830	bhv_vnode_t		*dir_vp;
2831	boolean_t		dp_joined_to_trans;
2832	boolean_t		created = B_FALSE;
2833	int			dm_event_sent = 0;
2834	xfs_prid_t		prid;
2835	struct xfs_dquot	*udqp, *gdqp;
2836	uint			resblks;
2837	int			dm_di_mode;
2838	int			dir_namelen;
2839
2840	dir_vp = BHV_TO_VNODE(dir_bdp);
2841	dp = XFS_BHVTOI(dir_bdp);
2842	mp = dp->i_mount;
2843
2844	if (XFS_FORCED_SHUTDOWN(mp))
2845		return XFS_ERROR(EIO);
2846
2847	dir_namelen = VNAMELEN(dentry);
2848
2849	tp = NULL;
2850	dp_joined_to_trans = B_FALSE;
2851	dm_di_mode = vap->va_mode;
2852
2853	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2854		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2855					dir_vp, DM_RIGHT_NULL, NULL,
2856					DM_RIGHT_NULL, dir_name, NULL,
2857					dm_di_mode, 0, 0);
2858		if (error)
2859			return error;
2860		dm_event_sent = 1;
2861	}
2862
2863	/* Return through std_return after this point. */
2864
2865	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2866
2867	mp = dp->i_mount;
2868	udqp = gdqp = NULL;
2869	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2870		prid = dp->i_d.di_projid;
2871	else if (vap->va_mask & XFS_AT_PROJID)
2872		prid = (xfs_prid_t)vap->va_projid;
2873	else
2874		prid = (xfs_prid_t)dfltprid;
2875
2876	/*
2877	 * Make sure that we have allocated dquot(s) on disk.
2878	 */
2879	error = XFS_QM_DQVOPALLOC(mp, dp,
2880			current_fsuid(credp), current_fsgid(credp), prid,
2881			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2882	if (error)
2883		goto std_return;
2884
2885	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2886	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2887	resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2888	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2889				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2890	if (error == ENOSPC) {
2891		resblks = 0;
2892		error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2893					  XFS_TRANS_PERM_LOG_RES,
2894					  XFS_MKDIR_LOG_COUNT);
2895	}
2896	if (error) {
2897		cancel_flags = 0;
2898		dp = NULL;
2899		goto error_return;
2900	}
2901
2902	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2903
2904	/*
2905	 * Check for directory link count overflow.
2906	 */
2907	if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2908		error = XFS_ERROR(EMLINK);
2909		goto error_return;
2910	}
2911
2912	/*
2913	 * Reserve disk quota and the inode.
2914	 */
2915	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2916	if (error)
2917		goto error_return;
2918
2919	if (resblks == 0 &&
2920	    (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2921		goto error_return;
2922	/*
2923	 * create the directory inode.
2924	 */
2925	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2926			0, credp, prid, resblks > 0,
2927		&cdp, NULL);
2928	if (error) {
2929		if (error == ENOSPC)
2930			goto error_return;
2931		goto abort_return;
2932	}
2933	ITRACE(cdp);
2934
2935	/*
2936	 * Now we add the directory inode to the transaction.
2937	 * We waited until now since xfs_dir_ialloc might start
2938	 * a new transaction.  Had we joined the transaction
2939	 * earlier, the locks might have gotten released.
2940	 */
2941	VN_HOLD(dir_vp);
2942	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2943	dp_joined_to_trans = B_TRUE;
2944
2945	XFS_BMAP_INIT(&free_list, &first_block);
2946
2947	error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2948				   &first_block, &free_list, resblks ?
2949				   resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2950	if (error) {
2951		ASSERT(error != ENOSPC);
2952		goto error1;
2953	}
2954	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2955
2956	/*
2957	 * Bump the in memory version number of the parent directory
2958	 * so that other processes accessing it will recognize that
2959	 * the directory has changed.
2960	 */
2961	dp->i_gen++;
2962
2963	error = xfs_dir_init(tp, cdp, dp);
2964	if (error)
2965		goto error2;
2966
2967	cdp->i_gen = 1;
2968	error = xfs_bumplink(tp, dp);
2969	if (error)
2970		goto error2;
2971
2972	cvp = XFS_ITOV(cdp);
2973
2974	created = B_TRUE;
2975
2976	*vpp = cvp;
2977	IHOLD(cdp);
2978
2979	/*
2980	 * Attach the dquots to the new inode and modify the icount incore.
2981	 */
2982	XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2983
2984	/*
2985	 * If this is a synchronous mount, make sure that the
2986	 * mkdir transaction goes to disk before returning to
2987	 * the user.
2988	 */
2989	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2990		xfs_trans_set_sync(tp);
2991	}
2992
2993	error = xfs_bmap_finish(&tp, &free_list, &committed);
2994	if (error) {
2995		IRELE(cdp);
2996		goto error2;
2997	}
2998
2999	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3000	XFS_QM_DQRELE(mp, udqp);
3001	XFS_QM_DQRELE(mp, gdqp);
3002	if (error) {
3003		IRELE(cdp);
3004	}
3005
3006	/* Fall through to std_return with error = 0 or errno from
3007	 * xfs_trans_commit. */
3008
3009std_return:
3010	if ( (created || (error != 0 && dm_event_sent != 0)) &&
3011			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3012						DM_EVENT_POSTCREATE)) {
3013		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
3014					dir_vp, DM_RIGHT_NULL,
3015					created ? XFS_ITOV(cdp):NULL,
3016					DM_RIGHT_NULL,
3017					dir_name, NULL,
3018					dm_di_mode, error, 0);
3019	}
3020	return error;
3021
3022 error2:
3023 error1:
3024	xfs_bmap_cancel(&free_list);
3025 abort_return:
3026	cancel_flags |= XFS_TRANS_ABORT;
3027 error_return:
3028	xfs_trans_cancel(tp, cancel_flags);
3029	XFS_QM_DQRELE(mp, udqp);
3030	XFS_QM_DQRELE(mp, gdqp);
3031
3032	if (!dp_joined_to_trans && (dp != NULL)) {
3033		xfs_iunlock(dp, XFS_ILOCK_EXCL);
3034	}
3035
3036	goto std_return;
3037}
3038
3039
3040/*
3041 * xfs_rmdir
3042 *
3043 */
3044STATIC int
3045xfs_rmdir(
3046	bhv_desc_t		*dir_bdp,
3047	bhv_vname_t		*dentry,
3048	cred_t			*credp)
3049{
3050	char			*name = VNAME(dentry);
3051	xfs_inode_t             *dp;
3052	xfs_inode_t             *cdp;   /* child directory */
3053	xfs_trans_t             *tp;
3054	xfs_mount_t		*mp;
3055	int                     error;
3056	xfs_bmap_free_t         free_list;
3057	xfs_fsblock_t           first_block;
3058	int			cancel_flags;
3059	int			committed;
3060	bhv_vnode_t		*dir_vp;
3061	int			dm_di_mode = S_IFDIR;
3062	int			last_cdp_link;
3063	int			namelen;
3064	uint			resblks;
3065
3066	dir_vp = BHV_TO_VNODE(dir_bdp);
3067	dp = XFS_BHVTOI(dir_bdp);
3068	mp = dp->i_mount;
3069
3070	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3071
3072	if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3073		return XFS_ERROR(EIO);
3074	namelen = VNAMELEN(dentry);
3075
3076	if (!xfs_get_dir_entry(dentry, &cdp)) {
3077	        dm_di_mode = cdp->i_d.di_mode;
3078		IRELE(cdp);
3079	}
3080
3081	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3082		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3083					dir_vp, DM_RIGHT_NULL,
3084					NULL, DM_RIGHT_NULL,
3085					name, NULL, dm_di_mode, 0, 0);
3086		if (error)
3087			return XFS_ERROR(error);
3088	}
3089
3090	/* Return through std_return after this point. */
3091
3092	cdp = NULL;
3093
3094	/*
3095	 * We need to get a reference to cdp before we get our log
3096	 * reservation.  The reason for this is that we cannot call
3097	 * xfs_iget for an inode for which we do not have a reference
3098	 * once we've acquired a log reservation.  This is because the
3099	 * inode we are trying to get might be in xfs_inactive going
3100	 * for a log reservation.  Since we'll have to wait for the
3101	 * inactive code to complete before returning from xfs_iget,
3102	 * we need to make sure that we don't have log space reserved
3103	 * when we call xfs_iget.  Instead we get an unlocked reference
3104	 * to the inode before getting our log reservation.
3105	 */
3106	error = xfs_get_dir_entry(dentry, &cdp);
3107	if (error) {
3108		REMOVE_DEBUG_TRACE(__LINE__);
3109		goto std_return;
3110	}
3111	mp = dp->i_mount;
3112	dm_di_mode = cdp->i_d.di_mode;
3113
3114	/*
3115	 * Get the dquots for the inodes.
3116	 */
3117	error = XFS_QM_DQATTACH(mp, dp, 0);
3118	if (!error && dp != cdp)
3119		error = XFS_QM_DQATTACH(mp, cdp, 0);
3120	if (error) {
3121		IRELE(cdp);
3122		REMOVE_DEBUG_TRACE(__LINE__);
3123		goto std_return;
3124	}
3125
3126	tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3127	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3128	/*
3129	 * We try to get the real space reservation first,
3130	 * allowing for directory btree deletion(s) implying
3131	 * possible bmap insert(s).  If we can't get the space
3132	 * reservation then we use 0 instead, and avoid the bmap
3133	 * btree insert(s) in the directory code by, if the bmap
3134	 * insert tries to happen, instead trimming the LAST
3135	 * block from the directory.
3136	 */
3137	resblks = XFS_REMOVE_SPACE_RES(mp);
3138	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3139			XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3140	if (error == ENOSPC) {
3141		resblks = 0;
3142		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3143				XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3144	}
3145	if (error) {
3146		ASSERT(error != ENOSPC);
3147		cancel_flags = 0;
3148		IRELE(cdp);
3149		goto error_return;
3150	}
3151	XFS_BMAP_INIT(&free_list, &first_block);
3152
3153	/*
3154	 * Now lock the child directory inode and the parent directory
3155	 * inode in the proper order.  This will take care of validating
3156	 * that the directory entry for the child directory inode has
3157	 * not changed while we were obtaining a log reservation.
3158	 */
3159	error = xfs_lock_dir_and_entry(dp, cdp);
3160	if (error) {
3161		xfs_trans_cancel(tp, cancel_flags);
3162		IRELE(cdp);
3163		goto std_return;
3164	}
3165
3166	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3167	if (dp != cdp) {
3168		/*
3169		 * Only increment the parent directory vnode count if
3170		 * we didn't bump it in looking up cdp.  The only time
3171		 * we don't bump it is when we're looking up ".".
3172		 */
3173		VN_HOLD(dir_vp);
3174	}
3175
3176	ITRACE(cdp);
3177	xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3178
3179	ASSERT(cdp->i_d.di_nlink >= 2);
3180	if (cdp->i_d.di_nlink != 2) {
3181		error = XFS_ERROR(ENOTEMPTY);
3182		goto error_return;
3183	}
3184	if (!xfs_dir_isempty(cdp)) {
3185		error = XFS_ERROR(ENOTEMPTY);
3186		goto error_return;
3187	}
3188
3189	error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
3190					&first_block, &free_list, resblks);
3191	if (error)
3192		goto error1;
3193
3194	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3195
3196	/*
3197	 * Bump the in memory generation count on the parent
3198	 * directory so that other can know that it has changed.
3199	 */
3200	dp->i_gen++;
3201
3202	/*
3203	 * Drop the link from cdp's "..".
3204	 */
3205	error = xfs_droplink(tp, dp);
3206	if (error) {
3207		goto error1;
3208	}
3209
3210	/*
3211	 * Drop the link from dp to cdp.
3212	 */
3213	error = xfs_droplink(tp, cdp);
3214	if (error) {
3215		goto error1;
3216	}
3217
3218	/*
3219	 * Drop the "." link from cdp to self.
3220	 */
3221	error = xfs_droplink(tp, cdp);
3222	if (error) {
3223		goto error1;
3224	}
3225
3226	/* Determine these before committing transaction */
3227	last_cdp_link = (cdp)->i_d.di_nlink==0;
3228
3229	/*
3230	 * Take an extra ref on the child vnode so that it
3231	 * does not go to xfs_inactive() from within the commit.
3232	 */
3233	IHOLD(cdp);
3234
3235	/*
3236	 * If this is a synchronous mount, make sure that the
3237	 * rmdir transaction goes to disk before returning to
3238	 * the user.
3239	 */
3240	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3241		xfs_trans_set_sync(tp);
3242	}
3243
3244	error = xfs_bmap_finish (&tp, &free_list, &committed);
3245	if (error) {
3246		xfs_bmap_cancel(&free_list);
3247		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3248				 XFS_TRANS_ABORT));
3249		IRELE(cdp);
3250		goto std_return;
3251	}
3252
3253	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3254	if (error) {
3255		IRELE(cdp);
3256		goto std_return;
3257	}
3258
3259
3260	/*
3261	 * Let interposed file systems know about removed links.
3262	 */
3263	bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3264
3265	IRELE(cdp);
3266
3267	/* Fall through to std_return with error = 0 or the errno
3268	 * from xfs_trans_commit. */
3269 std_return:
3270	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3271		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3272					dir_vp, DM_RIGHT_NULL,
3273					NULL, DM_RIGHT_NULL,
3274					name, NULL, dm_di_mode,
3275					error, 0);
3276	}
3277	return error;
3278
3279 error1:
3280	xfs_bmap_cancel(&free_list);
3281	cancel_flags |= XFS_TRANS_ABORT;
3282	/* FALLTHROUGH */
3283
3284 error_return:
3285	xfs_trans_cancel(tp, cancel_flags);
3286	goto std_return;
3287}
3288
3289
3290/*
3291 * Read dp's entries starting at uiop->uio_offset and translate them into
3292 * bufsize bytes worth of struct dirents starting at bufbase.
3293 */
3294STATIC int
3295xfs_readdir(
3296	bhv_desc_t	*dir_bdp,
3297	uio_t		*uiop,
3298	cred_t		*credp,
3299	int		*eofp)
3300{
3301	xfs_inode_t	*dp;
3302	xfs_trans_t	*tp = NULL;
3303	int		error = 0;
3304	uint		lock_mode;
3305
3306	vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3307					       (inst_t *)__return_address);
3308	dp = XFS_BHVTOI(dir_bdp);
3309
3310	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
3311		return XFS_ERROR(EIO);
3312
3313	lock_mode = xfs_ilock_map_shared(dp);
3314	error = xfs_dir_getdents(tp, dp, uiop, eofp);
3315	xfs_iunlock_map_shared(dp, lock_mode);
3316	return error;
3317}
3318
3319
3320STATIC int
3321xfs_symlink(
3322	bhv_desc_t		*dir_bdp,
3323	bhv_vname_t		*dentry,
3324	bhv_vattr_t		*vap,
3325	char			*target_path,
3326	bhv_vnode_t		**vpp,
3327	cred_t			*credp)
3328{
3329	xfs_trans_t		*tp;
3330	xfs_mount_t		*mp;
3331	xfs_inode_t		*dp;
3332	xfs_inode_t		*ip;
3333	int			error;
3334	int			pathlen;
3335	xfs_bmap_free_t		free_list;
3336	xfs_fsblock_t		first_block;
3337	boolean_t		dp_joined_to_trans;
3338	bhv_vnode_t		*dir_vp;
3339	uint			cancel_flags;
3340	int			committed;
3341	xfs_fileoff_t		first_fsb;
3342	xfs_filblks_t		fs_blocks;
3343	int			nmaps;
3344	xfs_bmbt_irec_t		mval[SYMLINK_MAPS];
3345	xfs_daddr_t		d;
3346	char			*cur_chunk;
3347	int			byte_cnt;
3348	int			n;
3349	xfs_buf_t		*bp;
3350	xfs_prid_t		prid;
3351	struct xfs_dquot	*udqp, *gdqp;
3352	uint			resblks;
3353	char			*link_name = VNAME(dentry);
3354	int			link_namelen;
3355
3356	*vpp = NULL;
3357	dir_vp = BHV_TO_VNODE(dir_bdp);
3358	dp = XFS_BHVTOI(dir_bdp);
3359	dp_joined_to_trans = B_FALSE;
3360	error = 0;
3361	ip = NULL;
3362	tp = NULL;
3363
3364	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3365
3366	mp = dp->i_mount;
3367
3368	if (XFS_FORCED_SHUTDOWN(mp))
3369		return XFS_ERROR(EIO);
3370
3371	link_namelen = VNAMELEN(dentry);
3372
3373	/*
3374	 * Check component lengths of the target path name.
3375	 */
3376	pathlen = strlen(target_path);
3377	if (pathlen >= MAXPATHLEN)      /* total string too long */
3378		return XFS_ERROR(ENAMETOOLONG);
3379	if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3380		int len, total;
3381		char *path;
3382
3383		for (total = 0, path = target_path; total < pathlen;) {
3384			/*
3385			 * Skip any slashes.
3386			 */
3387			while(*path == '/') {
3388				total++;
3389				path++;
3390			}
3391
3392			/*
3393			 * Count up to the next slash or end of path.
3394			 * Error out if the component is bigger than MAXNAMELEN.
3395			 */
3396			for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3397				if (++len >= MAXNAMELEN) {
3398					error = ENAMETOOLONG;
3399					return error;
3400				}
3401			}
3402		}
3403	}
3404
3405	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3406		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3407					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3408					link_name, target_path, 0, 0, 0);
3409		if (error)
3410			return error;
3411	}
3412
3413	/* Return through std_return after this point. */
3414
3415	udqp = gdqp = NULL;
3416	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3417		prid = dp->i_d.di_projid;
3418	else if (vap->va_mask & XFS_AT_PROJID)
3419		prid = (xfs_prid_t)vap->va_projid;
3420	else
3421		prid = (xfs_prid_t)dfltprid;
3422
3423	/*
3424	 * Make sure that we have allocated dquot(s) on disk.
3425	 */
3426	error = XFS_QM_DQVOPALLOC(mp, dp,
3427			current_fsuid(credp), current_fsgid(credp), prid,
3428			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3429	if (error)
3430		goto std_return;
3431
3432	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3433	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3434	/*
3435	 * The symlink will fit into the inode data fork?
3436	 * There can't be any attributes so we get the whole variable part.
3437	 */
3438	if (pathlen <= XFS_LITINO(mp))
3439		fs_blocks = 0;
3440	else
3441		fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3442	resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3443	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3444			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3445	if (error == ENOSPC && fs_blocks == 0) {
3446		resblks = 0;
3447		error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3448				XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3449	}
3450	if (error) {
3451		cancel_flags = 0;
3452		dp = NULL;
3453		goto error_return;
3454	}
3455
3456	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3457
3458	/*
3459	 * Check whether the directory allows new symlinks or not.
3460	 */
3461	if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3462		error = XFS_ERROR(EPERM);
3463		goto error_return;
3464	}
3465
3466	/*
3467	 * Reserve disk quota : blocks and inode.
3468	 */
3469	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3470	if (error)
3471		goto error_return;
3472
3473	/*
3474	 * Check for ability to enter directory entry, if no space reserved.
3475	 */
3476	if (resblks == 0 &&
3477	    (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
3478		goto error_return;
3479	/*
3480	 * Initialize the bmap freelist prior to calling either
3481	 * bmapi or the directory create code.
3482	 */
3483	XFS_BMAP_INIT(&free_list, &first_block);
3484
3485	/*
3486	 * Allocate an inode for the symlink.
3487	 */
3488	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3489			       1, 0, credp, prid, resblks > 0, &ip, NULL);
3490	if (error) {
3491		if (error == ENOSPC)
3492			goto error_return;
3493		goto error1;
3494	}
3495	ITRACE(ip);
3496
3497	VN_HOLD(dir_vp);
3498	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3499	dp_joined_to_trans = B_TRUE;
3500
3501	/*
3502	 * Also attach the dquot(s) to it, if applicable.
3503	 */
3504	XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3505
3506	if (resblks)
3507		resblks -= XFS_IALLOC_SPACE_RES(mp);
3508	/*
3509	 * If the symlink will fit into the inode, write it inline.
3510	 */
3511	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3512		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3513		memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3514		ip->i_d.di_size = pathlen;
3515
3516		/*
3517		 * The inode was initially created in extent format.
3518		 */
3519		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3520		ip->i_df.if_flags |= XFS_IFINLINE;
3521
3522		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3523		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3524
3525	} else {
3526		first_fsb = 0;
3527		nmaps = SYMLINK_MAPS;
3528
3529		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3530				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3531				  &first_block, resblks, mval, &nmaps,
3532				  &free_list, NULL);
3533		if (error) {
3534			goto error1;
3535		}
3536
3537		if (resblks)
3538			resblks -= fs_blocks;
3539		ip->i_d.di_size = pathlen;
3540		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3541
3542		cur_chunk = target_path;
3543		for (n = 0; n < nmaps; n++) {
3544			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3545			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3546			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3547					       BTOBB(byte_cnt), 0);
3548			ASSERT(bp && !XFS_BUF_GETERROR(bp));
3549			if (pathlen < byte_cnt) {
3550				byte_cnt = pathlen;
3551			}
3552			pathlen -= byte_cnt;
3553
3554			memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3555			cur_chunk += byte_cnt;
3556
3557			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3558		}
3559	}
3560
3561	/*
3562	 * Create the directory entry for the symlink.
3563	 */
3564	error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
3565				   &first_block, &free_list, resblks);
3566	if (error)
3567		goto error1;
3568	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3569	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3570
3571	/*
3572	 * Bump the in memory version number of the parent directory
3573	 * so that other processes accessing it will recognize that
3574	 * the directory has changed.
3575	 */
3576	dp->i_gen++;
3577
3578	/*
3579	 * If this is a synchronous mount, make sure that the
3580	 * symlink transaction goes to disk before returning to
3581	 * the user.
3582	 */
3583	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3584		xfs_trans_set_sync(tp);
3585	}
3586
3587	/*
3588	 * xfs_trans_commit normally decrements the vnode ref count
3589	 * when it unlocks the inode. Since we want to return the
3590	 * vnode to the caller, we bump the vnode ref count now.
3591	 */
3592	IHOLD(ip);
3593
3594	error = xfs_bmap_finish(&tp, &free_list, &committed);
3595	if (error) {
3596		goto error2;
3597	}
3598	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3599	XFS_QM_DQRELE(mp, udqp);
3600	XFS_QM_DQRELE(mp, gdqp);
3601
3602	/* Fall through to std_return with error = 0 or errno from
3603	 * xfs_trans_commit	*/
3604std_return:
3605	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3606			     DM_EVENT_POSTSYMLINK)) {
3607		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3608					dir_vp, DM_RIGHT_NULL,
3609					error ? NULL : XFS_ITOV(ip),
3610					DM_RIGHT_NULL, link_name, target_path,
3611					0, error, 0);
3612	}
3613
3614	if (!error) {
3615		bhv_vnode_t *vp;
3616
3617		ASSERT(ip);
3618		vp = XFS_ITOV(ip);
3619		*vpp = vp;
3620	}
3621	return error;
3622
3623 error2:
3624	IRELE(ip);
3625 error1:
3626	xfs_bmap_cancel(&free_list);
3627	cancel_flags |= XFS_TRANS_ABORT;
3628 error_return:
3629	xfs_trans_cancel(tp, cancel_flags);
3630	XFS_QM_DQRELE(mp, udqp);
3631	XFS_QM_DQRELE(mp, gdqp);
3632
3633	if (!dp_joined_to_trans && (dp != NULL)) {
3634		xfs_iunlock(dp, XFS_ILOCK_EXCL);
3635	}
3636
3637	goto std_return;
3638}
3639
3640
3641/*
3642 * xfs_fid2
3643 *
3644 * A fid routine that takes a pointer to a previously allocated
3645 * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3646 */
3647STATIC int
3648xfs_fid2(
3649	bhv_desc_t	*bdp,
3650	fid_t		*fidp)
3651{
3652	xfs_inode_t	*ip;
3653	xfs_fid2_t	*xfid;
3654
3655	vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3656				       (inst_t *)__return_address);
3657	ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3658
3659	xfid = (xfs_fid2_t *)fidp;
3660	ip = XFS_BHVTOI(bdp);
3661	xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3662	xfid->fid_pad = 0;
3663	/*
3664	 * use memcpy because the inode is a long long and there's no
3665	 * assurance that xfid->fid_ino is properly aligned.
3666	 */
3667	memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3668	xfid->fid_gen = ip->i_d.di_gen;
3669
3670	return 0;
3671}
3672
3673
3674/*
3675 * xfs_rwlock
3676 */
3677int
3678xfs_rwlock(
3679	bhv_desc_t	*bdp,
3680	bhv_vrwlock_t	locktype)
3681{
3682	xfs_inode_t	*ip;
3683	bhv_vnode_t	*vp;
3684
3685	vp = BHV_TO_VNODE(bdp);
3686	if (VN_ISDIR(vp))
3687		return 1;
3688	ip = XFS_BHVTOI(bdp);
3689	if (locktype == VRWLOCK_WRITE) {
3690		xfs_ilock(ip, XFS_IOLOCK_EXCL);
3691	} else if (locktype == VRWLOCK_TRY_READ) {
3692		return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3693	} else if (locktype == VRWLOCK_TRY_WRITE) {
3694		return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3695	} else {
3696		ASSERT((locktype == VRWLOCK_READ) ||
3697		       (locktype == VRWLOCK_WRITE_DIRECT));
3698		xfs_ilock(ip, XFS_IOLOCK_SHARED);
3699	}
3700
3701	return 1;
3702}
3703
3704
3705/*
3706 * xfs_rwunlock
3707 */
3708void
3709xfs_rwunlock(
3710	bhv_desc_t	*bdp,
3711	bhv_vrwlock_t	locktype)
3712{
3713	xfs_inode_t     *ip;
3714	bhv_vnode_t	*vp;
3715
3716	vp = BHV_TO_VNODE(bdp);
3717	if (VN_ISDIR(vp))
3718		return;
3719	ip = XFS_BHVTOI(bdp);
3720	if (locktype == VRWLOCK_WRITE) {
3721		/*
3722		 * In the write case, we may have added a new entry to
3723		 * the reference cache.  This might store a pointer to
3724		 * an inode to be released in this inode.  If it is there,
3725		 * clear the pointer and release the inode after unlocking
3726		 * this one.
3727		 */
3728		xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3729	} else {
3730		ASSERT((locktype == VRWLOCK_READ) ||
3731		       (locktype == VRWLOCK_WRITE_DIRECT));
3732		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3733	}
3734	return;
3735}
3736
3737STATIC int
3738xfs_inode_flush(
3739	bhv_desc_t	*bdp,
3740	int		flags)
3741{
3742	xfs_inode_t	*ip;
3743	xfs_mount_t	*mp;
3744	xfs_inode_log_item_t *iip;
3745	int		error = 0;
3746
3747	ip = XFS_BHVTOI(bdp);
3748	mp = ip->i_mount;
3749	iip = ip->i_itemp;
3750
3751	if (XFS_FORCED_SHUTDOWN(mp))
3752		return XFS_ERROR(EIO);
3753
3754	/*
3755	 * Bypass inodes which have already been cleaned by
3756	 * the inode flush clustering code inside xfs_iflush
3757	 */
3758	if ((ip->i_update_core == 0) &&
3759	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3760		return 0;
3761
3762	if (flags & FLUSH_LOG) {
3763		if (iip && iip->ili_last_lsn) {
3764			xlog_t		*log = mp->m_log;
3765			xfs_lsn_t	sync_lsn;
3766			int		s, log_flags = XFS_LOG_FORCE;
3767
3768			s = GRANT_LOCK(log);
3769			sync_lsn = log->l_last_sync_lsn;
3770			GRANT_UNLOCK(log, s);
3771
3772			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3773				return 0;
3774
3775			if (flags & FLUSH_SYNC)
3776				log_flags |= XFS_LOG_SYNC;
3777			return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3778		}
3779	}
3780
3781	/*
3782	 * We make this non-blocking if the inode is contended,
3783	 * return EAGAIN to indicate to the caller that they
3784	 * did not succeed. This prevents the flush path from
3785	 * blocking on inodes inside another operation right
3786	 * now, they get caught later by xfs_sync.
3787	 */
3788	if (flags & FLUSH_INODE) {
3789		int	flush_flags;
3790
3791		if (xfs_ipincount(ip))
3792			return EAGAIN;
3793
3794		if (flags & FLUSH_SYNC) {
3795			xfs_ilock(ip, XFS_ILOCK_SHARED);
3796			xfs_iflock(ip);
3797		} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3798			if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3799				xfs_iunlock(ip, XFS_ILOCK_SHARED);
3800				return EAGAIN;
3801			}
3802		} else {
3803			return EAGAIN;
3804		}
3805
3806		if (flags & FLUSH_SYNC)
3807			flush_flags = XFS_IFLUSH_SYNC;
3808		else
3809			flush_flags = XFS_IFLUSH_ASYNC;
3810
3811		error = xfs_iflush(ip, flush_flags);
3812		xfs_iunlock(ip, XFS_ILOCK_SHARED);
3813	}
3814
3815	return error;
3816}
3817
3818int
3819xfs_set_dmattrs (
3820	bhv_desc_t	*bdp,
3821	u_int		evmask,
3822	u_int16_t	state,
3823	cred_t		*credp)
3824{
3825	xfs_inode_t     *ip;
3826	xfs_trans_t	*tp;
3827	xfs_mount_t	*mp;
3828	int		error;
3829
3830	if (!capable(CAP_SYS_ADMIN))
3831		return XFS_ERROR(EPERM);
3832
3833	ip = XFS_BHVTOI(bdp);
3834	mp = ip->i_mount;
3835
3836	if (XFS_FORCED_SHUTDOWN(mp))
3837		return XFS_ERROR(EIO);
3838
3839	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3840	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3841	if (error) {
3842		xfs_trans_cancel(tp, 0);
3843		return error;
3844	}
3845	xfs_ilock(ip, XFS_ILOCK_EXCL);
3846	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3847
3848	ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3849	ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3850
3851	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3852	IHOLD(ip);
3853	error = xfs_trans_commit(tp, 0);
3854
3855	return error;
3856}
3857
3858STATIC int
3859xfs_reclaim(
3860	bhv_desc_t	*bdp)
3861{
3862	xfs_inode_t	*ip;
3863	bhv_vnode_t	*vp;
3864
3865	vp = BHV_TO_VNODE(bdp);
3866	ip = XFS_BHVTOI(bdp);
3867
3868	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3869
3870	ASSERT(!VN_MAPPED(vp));
3871
3872	/* bad inode, get out here ASAP */
3873	if (VN_BAD(vp)) {
3874		xfs_ireclaim(ip);
3875		return 0;
3876	}
3877
3878	vn_iowait(vp);
3879
3880	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3881
3882	/*
3883	 * Make sure the atime in the XFS inode is correct before freeing the
3884	 * Linux inode.
3885	 */
3886	xfs_synchronize_atime(ip);
3887
3888	/*
3889	 * If we have nothing to flush with this inode then complete the
3890	 * teardown now, otherwise break the link between the xfs inode and the
3891	 * linux inode and clean up the xfs inode later. This avoids flushing
3892	 * the inode to disk during the delete operation itself.
3893	 *
3894	 * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3895	 * first to ensure that xfs_iunpin() will never see an xfs inode
3896	 * that has a linux inode being reclaimed. Synchronisation is provided
3897	 * by the i_flags_lock.
3898	 */
3899	if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3900		xfs_ilock(ip, XFS_ILOCK_EXCL);
3901		xfs_iflock(ip);
3902		return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3903	} else {
3904		xfs_mount_t	*mp = ip->i_mount;
3905
3906		/* Protect sync and unpin from us */
3907		XFS_MOUNT_ILOCK(mp);
3908		spin_lock(&ip->i_flags_lock);
3909		__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3910		vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3911		spin_unlock(&ip->i_flags_lock);
3912		list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3913		XFS_MOUNT_IUNLOCK(mp);
3914	}
3915	return 0;
3916}
3917
3918int
3919xfs_finish_reclaim(
3920	xfs_inode_t	*ip,
3921	int		locked,
3922	int		sync_mode)
3923{
3924	xfs_ihash_t	*ih = ip->i_hash;
3925	bhv_vnode_t	*vp = XFS_ITOV_NULL(ip);
3926	int		error;
3927
3928	if (vp && VN_BAD(vp))
3929		goto reclaim;
3930
3931	/* The hash lock here protects a thread in xfs_iget_core from
3932	 * racing with us on linking the inode back with a vnode.
3933	 * Once we have the XFS_IRECLAIM flag set it will not touch
3934	 * us.
3935	 */
3936	write_lock(&ih->ih_lock);
3937	spin_lock(&ip->i_flags_lock);
3938	if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3939	    (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3940		spin_unlock(&ip->i_flags_lock);
3941		write_unlock(&ih->ih_lock);
3942		if (locked) {
3943			xfs_ifunlock(ip);
3944			xfs_iunlock(ip, XFS_ILOCK_EXCL);
3945		}
3946		return 1;
3947	}
3948	__xfs_iflags_set(ip, XFS_IRECLAIM);
3949	spin_unlock(&ip->i_flags_lock);
3950	write_unlock(&ih->ih_lock);
3951
3952	/*
3953	 * If the inode is still dirty, then flush it out.  If the inode
3954	 * is not in the AIL, then it will be OK to flush it delwri as
3955	 * long as xfs_iflush() does not keep any references to the inode.
3956	 * We leave that decision up to xfs_iflush() since it has the
3957	 * knowledge of whether it's OK to simply do a delwri flush of
3958	 * the inode or whether we need to wait until the inode is
3959	 * pulled from the AIL.
3960	 * We get the flush lock regardless, though, just to make sure
3961	 * we don't free it while it is being flushed.
3962	 */
3963	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3964		if (!locked) {
3965			xfs_ilock(ip, XFS_ILOCK_EXCL);
3966			xfs_iflock(ip);
3967		}
3968
3969		if (ip->i_update_core ||
3970		    ((ip->i_itemp != NULL) &&
3971		     (ip->i_itemp->ili_format.ilf_fields != 0))) {
3972			error = xfs_iflush(ip, sync_mode);
3973			/*
3974			 * If we hit an error, typically because of filesystem
3975			 * shutdown, we don't need to let vn_reclaim to know
3976			 * because we're gonna reclaim the inode anyway.
3977			 */
3978			if (error) {
3979				xfs_iunlock(ip, XFS_ILOCK_EXCL);
3980				goto reclaim;
3981			}
3982			xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3983		}
3984
3985		ASSERT(ip->i_update_core == 0);
3986		ASSERT(ip->i_itemp == NULL ||
3987		       ip->i_itemp->ili_format.ilf_fields == 0);
3988		xfs_iunlock(ip, XFS_ILOCK_EXCL);
3989	} else if (locked) {
3990		/*
3991		 * We are not interested in doing an iflush if we're
3992		 * in the process of shutting down the filesystem forcibly.
3993		 * So, just reclaim the inode.
3994		 */
3995		xfs_ifunlock(ip);
3996		xfs_iunlock(ip, XFS_ILOCK_EXCL);
3997	}
3998
3999 reclaim:
4000	xfs_ireclaim(ip);
4001	return 0;
4002}
4003
4004int
4005xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
4006{
4007	int		purged;
4008	xfs_inode_t	*ip, *n;
4009	int		done = 0;
4010
4011	while (!done) {
4012		purged = 0;
4013		XFS_MOUNT_ILOCK(mp);
4014		list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
4015			if (noblock) {
4016				if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
4017					continue;
4018				if (xfs_ipincount(ip) ||
4019				    !xfs_iflock_nowait(ip)) {
4020					xfs_iunlock(ip, XFS_ILOCK_EXCL);
4021					continue;
4022				}
4023			}
4024			XFS_MOUNT_IUNLOCK(mp);
4025			if (xfs_finish_reclaim(ip, noblock,
4026					XFS_IFLUSH_DELWRI_ELSE_ASYNC))
4027				delay(1);
4028			purged = 1;
4029			break;
4030		}
4031
4032		done = !purged;
4033	}
4034
4035	XFS_MOUNT_IUNLOCK(mp);
4036	return 0;
4037}
4038
4039/*
4040 * xfs_alloc_file_space()
4041 *      This routine allocates disk space for the given file.
4042 *
4043 *	If alloc_type == 0, this request is for an ALLOCSP type
4044 *	request which will change the file size.  In this case, no
4045 *	DMAPI event will be generated by the call.  A TRUNCATE event
4046 *	will be generated later by xfs_setattr.
4047 *
4048 *	If alloc_type != 0, this request is for a RESVSP type
4049 *	request, and a DMAPI DM_EVENT_WRITE will be generated if the
4050 *	lower block boundary byte address is less than the file's
4051 *	length.
4052 *
4053 * RETURNS:
4054 *       0 on success
4055 *      errno on error
4056 *
4057 */
4058STATIC int
4059xfs_alloc_file_space(
4060	xfs_inode_t		*ip,
4061	xfs_off_t		offset,
4062	xfs_off_t		len,
4063	int			alloc_type,
4064	int			attr_flags)
4065{
4066	xfs_mount_t		*mp = ip->i_mount;
4067	xfs_off_t		count;
4068	xfs_filblks_t		allocated_fsb;
4069	xfs_filblks_t		allocatesize_fsb;
4070	xfs_extlen_t		extsz, temp;
4071	xfs_fileoff_t		startoffset_fsb;
4072	xfs_fsblock_t		firstfsb;
4073	int			nimaps;
4074	int			bmapi_flag;
4075	int			quota_flag;
4076	int			rt;
4077	xfs_trans_t		*tp;
4078	xfs_bmbt_irec_t		imaps[1], *imapp;
4079	xfs_bmap_free_t		free_list;
4080	uint			qblocks, resblks, resrtextents;
4081	int			committed;
4082	int			error;
4083
4084	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4085
4086	if (XFS_FORCED_SHUTDOWN(mp))
4087		return XFS_ERROR(EIO);
4088
4089	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4090		return error;
4091
4092	if (len <= 0)
4093		return XFS_ERROR(EINVAL);
4094
4095	rt = XFS_IS_REALTIME_INODE(ip);
4096	extsz = xfs_get_extsz_hint(ip);
4097
4098	count = len;
4099	imapp = &imaps[0];
4100	nimaps = 1;
4101	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4102	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
4103	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4104
4105	/*	Generate a DMAPI event if needed.	*/
4106	if (alloc_type != 0 && offset < ip->i_size &&
4107			(attr_flags&ATTR_DMI) == 0  &&
4108			DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4109		xfs_off_t           end_dmi_offset;
4110
4111		end_dmi_offset = offset+len;
4112		if (end_dmi_offset > ip->i_size)
4113			end_dmi_offset = ip->i_size;
4114		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4115			offset, end_dmi_offset - offset,
4116			0, NULL);
4117		if (error)
4118			return error;
4119	}
4120
4121	/*
4122	 * Allocate file space until done or until there is an error
4123	 */
4124retry:
4125	while (allocatesize_fsb && !error) {
4126		xfs_fileoff_t	s, e;
4127
4128		/*
4129		 * Determine space reservations for data/realtime.
4130		 */
4131		if (unlikely(extsz)) {
4132			s = startoffset_fsb;
4133			do_div(s, extsz);
4134			s *= extsz;
4135			e = startoffset_fsb + allocatesize_fsb;
4136			if ((temp = do_mod(startoffset_fsb, extsz)))
4137				e += temp;
4138			if ((temp = do_mod(e, extsz)))
4139				e += extsz - temp;
4140		} else {
4141			s = 0;
4142			e = allocatesize_fsb;
4143		}
4144
4145		if (unlikely(rt)) {
4146			resrtextents = qblocks = (uint)(e - s);
4147			resrtextents /= mp->m_sb.sb_rextsize;
4148			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4149			quota_flag = XFS_QMOPT_RES_RTBLKS;
4150		} else {
4151			resrtextents = 0;
4152			resblks = qblocks = \
4153				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
4154			quota_flag = XFS_QMOPT_RES_REGBLKS;
4155		}
4156
4157		/*
4158		 * Allocate and setup the transaction.
4159		 */
4160		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4161		error = xfs_trans_reserve(tp, resblks,
4162					  XFS_WRITE_LOG_RES(mp), resrtextents,
4163					  XFS_TRANS_PERM_LOG_RES,
4164					  XFS_WRITE_LOG_COUNT);
4165		/*
4166		 * Check for running out of space
4167		 */
4168		if (error) {
4169			/*
4170			 * Free the transaction structure.
4171			 */
4172			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4173			xfs_trans_cancel(tp, 0);
4174			break;
4175		}
4176		xfs_ilock(ip, XFS_ILOCK_EXCL);
4177		error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
4178						      qblocks, 0, quota_flag);
4179		if (error)
4180			goto error1;
4181
4182		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4183		xfs_trans_ihold(tp, ip);
4184
4185		/*
4186		 * Issue the xfs_bmapi() call to allocate the blocks
4187		 */
4188		XFS_BMAP_INIT(&free_list, &firstfsb);
4189		error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4190				  allocatesize_fsb, bmapi_flag,
4191				  &firstfsb, 0, imapp, &nimaps,
4192				  &free_list, NULL);
4193		if (error) {
4194			goto error0;
4195		}
4196
4197		/*
4198		 * Complete the transaction
4199		 */
4200		error = xfs_bmap_finish(&tp, &free_list, &committed);
4201		if (error) {
4202			goto error0;
4203		}
4204
4205		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4206		xfs_iunlock(ip, XFS_ILOCK_EXCL);
4207		if (error) {
4208			break;
4209		}
4210
4211		allocated_fsb = imapp->br_blockcount;
4212
4213		if (nimaps == 0) {
4214			error = XFS_ERROR(ENOSPC);
4215			break;
4216		}
4217
4218		startoffset_fsb += allocated_fsb;
4219		allocatesize_fsb -= allocated_fsb;
4220	}
4221dmapi_enospc_check:
4222	if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
4223	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4224
4225		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4226				XFS_ITOV(ip), DM_RIGHT_NULL,
4227				XFS_ITOV(ip), DM_RIGHT_NULL,
4228				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4229		if (error == 0)
4230			goto retry;	/* Maybe DMAPI app. has made space */
4231		/* else fall through with error from XFS_SEND_DATA */
4232	}
4233
4234	return error;
4235
4236error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4237	xfs_bmap_cancel(&free_list);
4238	XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4239
4240error1:	/* Just cancel transaction */
4241	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4242	xfs_iunlock(ip, XFS_ILOCK_EXCL);
4243	goto dmapi_enospc_check;
4244}
4245
4246/*
4247 * Zero file bytes between startoff and endoff inclusive.
4248 * The iolock is held exclusive and no blocks are buffered.
4249 */
4250STATIC int
4251xfs_zero_remaining_bytes(
4252	xfs_inode_t		*ip,
4253	xfs_off_t		startoff,
4254	xfs_off_t		endoff)
4255{
4256	xfs_bmbt_irec_t		imap;
4257	xfs_fileoff_t		offset_fsb;
4258	xfs_off_t		lastoffset;
4259	xfs_off_t		offset;
4260	xfs_buf_t		*bp;
4261	xfs_mount_t		*mp = ip->i_mount;
4262	int			nimap;
4263	int			error = 0;
4264
4265	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4266				ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4267				mp->m_rtdev_targp : mp->m_ddev_targp);
4268
4269	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4270		offset_fsb = XFS_B_TO_FSBT(mp, offset);
4271		nimap = 1;
4272		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4273			NULL, 0, &imap, &nimap, NULL, NULL);
4274		if (error || nimap < 1)
4275			break;
4276		ASSERT(imap.br_blockcount >= 1);
4277		ASSERT(imap.br_startoff == offset_fsb);
4278		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4279		if (lastoffset > endoff)
4280			lastoffset = endoff;
4281		if (imap.br_startblock == HOLESTARTBLOCK)
4282			continue;
4283		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4284		if (imap.br_state == XFS_EXT_UNWRITTEN)
4285			continue;
4286		XFS_BUF_UNDONE(bp);
4287		XFS_BUF_UNWRITE(bp);
4288		XFS_BUF_READ(bp);
4289		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4290		xfsbdstrat(mp, bp);
4291		if ((error = xfs_iowait(bp))) {
4292			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4293					  mp, bp, XFS_BUF_ADDR(bp));
4294			break;
4295		}
4296		memset(XFS_BUF_PTR(bp) +
4297			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4298		      0, lastoffset - offset + 1);
4299		XFS_BUF_UNDONE(bp);
4300		XFS_BUF_UNREAD(bp);
4301		XFS_BUF_WRITE(bp);
4302		xfsbdstrat(mp, bp);
4303		if ((error = xfs_iowait(bp))) {
4304			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4305					  mp, bp, XFS_BUF_ADDR(bp));
4306			break;
4307		}
4308	}
4309	xfs_buf_free(bp);
4310	return error;
4311}
4312
/*
 * xfs_free_file_space()
 *      This routine frees disk space for the given file.
 *
 *	This routine is only called by xfs_change_file_space
 *	for an UNRESVSP type call.
 *
 *	The whole filesystem blocks inside [offset, offset + len) are
 *	unmapped with XFS_BUNMAPI, one transaction per pass of the loop
 *	below.  The partial blocks at either edge of the range are not
 *	freed; they are zeroed with xfs_zero_remaining_bytes() instead.
 *
 * PARAMETERS:
 *	ip         - inode whose space is being freed
 *	offset     - byte offset of the start of the range
 *	len        - length of the range in bytes; if <= 0 the routine
 *	             returns 0 without freeing anything
 *	attr_flags - ATTR_DMI suppresses the DM_EVENT_WRITE event,
 *	             ATTR_NOLOCK means XFS_IOLOCK_EXCL is neither taken
 *	             nor dropped here
 *
 * RETURNS:
 *       0 on success
 *      errno on error
 *
 */
STATIC int
xfs_free_file_space(
	xfs_inode_t		*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			attr_flags)
{
	bhv_vnode_t		*vp;
	int			committed;
	int			done;
	xfs_off_t		end_dmi_offset;
	xfs_fileoff_t		endoffset_fsb;
	int			error;
	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	xfs_bmbt_irec_t		imap;
	xfs_off_t		ioffset;
	xfs_extlen_t		mod=0;
	xfs_mount_t		*mp;
	int			nimap;
	uint			resblks;
	uint			rounding;
	int			rt;
	xfs_fileoff_t		startoffset_fsb;
	xfs_trans_t		*tp;
	int			need_iolock = 1;

	vp = XFS_ITOV(ip);
	mp = ip->i_mount;

	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
		return error;

	error = 0;
	if (len <= 0)	/* if nothing being freed */
		return error;
	rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
	/*
	 * startoffset_fsb/endoffset_fsb delimit the blocks handed to
	 * XFS_BUNMAPI; the byte range itself is kept in offset/len for the
	 * edge zeroing further down.
	 */
	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
	end_dmi_offset = offset + len;
	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);

	/*
	 * Send a DM write event for the part of the range that lies below
	 * the current file size, unless the caller set ATTR_DMI.
	 */
	if (offset < ip->i_size &&
	    (attr_flags & ATTR_DMI) == 0 &&
	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
		if (end_dmi_offset > ip->i_size)
			end_dmi_offset = ip->i_size;
		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
				offset, end_dmi_offset - offset,
				AT_DELAY_FLAG(attr_flags), NULL);
		if (error)
			return error;
	}

	if (attr_flags & ATTR_NOLOCK)
		need_iolock = 0;
	if (need_iolock) {
		xfs_ilock(ip, XFS_IOLOCK_EXCL);
		vn_iowait(vp);	/* wait for the completion of any pending DIOs */
	}

	/*
	 * Round the start of the range down to the larger of the filesystem
	 * block size and NBPP before flushing and invalidating cached pages
	 * from there on.
	 */
	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
	ioffset = offset & ~(rounding - 1);

	if (VN_CACHED(vp) != 0) {
		xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
				ctooff(offtoct(ioffset)), -1);
		error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
				-1, FI_REMAPF_LOCKED);
		if (error)
			goto out_unlock_iolock;
	}

	/*
	 * Need to zero the stuff we're not freeing, on disk.
	 * If its a realtime file & can't use unwritten extents then we
	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
	 * will take care of it for us.
	 */
	if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
		/* Look up the single block at the start of the range. */
		nimap = 1;
		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
		if (error)
			goto out_unlock_iolock;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			xfs_daddr_t	block;

			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			/*
			 * do_div() leaves the quotient in block and returns
			 * the remainder; a non-zero remainder pushes the
			 * start forward by (sb_rextsize - mod) blocks.
			 */
			block = imap.br_startblock;
			mod = do_div(block, mp->m_sb.sb_rextsize);
			if (mod)
				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
		}
		/* Look up the last block of the range. */
		nimap = 1;
		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
		if (error)
			goto out_unlock_iolock;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			/*
			 * NOTE(review): mod is not recomputed from this
			 * second mapping; it still holds the remainder
			 * obtained for the start block above (or its initial
			 * 0).  Confirm that this is the intended value for
			 * trimming the end of the range.
			 */
			mod++;
			if (mod && (mod != mp->m_sb.sb_rextsize))
				endoffset_fsb -= mod;
		}
	}
	/*
	 * done is set here when no whole block is left to unmap, which also
	 * makes the transaction loop below a no-op.
	 */
	if ((done = (endoffset_fsb <= startoffset_fsb)))
		/*
		 * One contiguous piece to clear
		 */
		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
	else {
		/*
		 * Some full blocks, possibly two pieces to clear
		 */
		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
			error = xfs_zero_remaining_bytes(ip, offset,
				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
		if (!error &&
		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
			error = xfs_zero_remaining_bytes(ip,
				XFS_FSB_TO_B(mp, endoffset_fsb),
				offset + len - 1);
	}

	/*
	 * free file space until done or until there is an error
	 */
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	while (!error && !done) {

		/*
		 * allocate and setup the transaction. Allow this
		 * transaction to dip into the reserve blocks to ensure
		 * the freeing of the space succeeds at ENOSPC.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		tp->t_flags |= XFS_TRANS_RESERVE;
		error = xfs_trans_reserve(tp,
					  resblks,
					  XFS_WRITE_LOG_RES(mp),
					  0,
					  XFS_TRANS_PERM_LOG_RES,
					  XFS_WRITE_LOG_COUNT);

		/*
		 * check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
				ip->i_udquot, ip->i_gdquot, resblks, 0,
				XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto error1;

		/*
		 * Join the inode to the transaction and hold it; the ILOCK
		 * is dropped explicitly at the bottom of each pass.
		 */
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * issue the bunmapi() call to free the blocks
		 */
		XFS_BMAP_INIT(&free_list, &firstfsb);
		/* XFS_BUNMAPI reports through done whether another pass is needed. */
		error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
				  endoffset_fsb - startoffset_fsb,
				  0, 2, &firstfsb, &free_list, NULL, &done);
		if (error) {
			goto error0;
		}

		/*
		 * complete the transaction
		 */
		error = xfs_bmap_finish(&tp, &free_list, &committed);
		if (error) {
			goto error0;
		}

		/* A commit error ends the loop through the !error test. */
		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	/* Normal exit, and error exit for paths that hold no ILOCK/transaction. */
 out_unlock_iolock:
	if (need_iolock)
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;

	/* error0: a free list exists and must be cancelled before the transaction. */
 error0:
	xfs_bmap_cancel(&free_list);
	/* error1: cancel the transaction, drop the ILOCK and, if taken here, the IOLOCK. */
 error1:
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
		    XFS_ILOCK_EXCL);
	return error;
}
4530
4531/*
4532 * xfs_change_file_space()
4533 *      This routine allocates or frees disk space for the given file.
4534 *      The user specified parameters are checked for alignment and size
4535 *      limitations.
4536 *
4537 * RETURNS:
4538 *       0 on success
4539 *      errno on error
4540 *
4541 */
4542int
4543xfs_change_file_space(
4544	bhv_desc_t	*bdp,
4545	int		cmd,
4546	xfs_flock64_t	*bf,
4547	xfs_off_t	offset,
4548	cred_t		*credp,
4549	int		attr_flags)
4550{
4551	int		clrprealloc;
4552	int		error;
4553	xfs_fsize_t	fsize;
4554	xfs_inode_t	*ip;
4555	xfs_mount_t	*mp;
4556	int		setprealloc;
4557	xfs_off_t	startoffset;
4558	xfs_off_t	llen;
4559	xfs_trans_t	*tp;
4560	bhv_vattr_t	va;
4561	bhv_vnode_t	*vp;
4562
4563	vp = BHV_TO_VNODE(bdp);
4564	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4565
4566	ip = XFS_BHVTOI(bdp);
4567	mp = ip->i_mount;
4568
4569	/*
4570	 * must be a regular file and have write permission
4571	 */
4572	if (!VN_ISREG(vp))
4573		return XFS_ERROR(EINVAL);
4574
4575	xfs_ilock(ip, XFS_ILOCK_SHARED);
4576
4577	if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4578		xfs_iunlock(ip, XFS_ILOCK_SHARED);
4579		return error;
4580	}
4581
4582	xfs_iunlock(ip, XFS_ILOCK_SHARED);
4583
4584	switch (bf->l_whence) {
4585	case 0: /*SEEK_SET*/
4586		break;
4587	case 1: /*SEEK_CUR*/
4588		bf->l_start += offset;
4589		break;
4590	case 2: /*SEEK_END*/
4591		bf->l_start += ip->i_size;
4592		break;
4593	default:
4594		return XFS_ERROR(EINVAL);
4595	}
4596
4597	llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4598
4599	if (   (bf->l_start < 0)
4600	    || (bf->l_start > XFS_MAXIOFFSET(mp))
4601	    || (bf->l_start + llen < 0)
4602	    || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4603		return XFS_ERROR(EINVAL);
4604
4605	bf->l_whence = 0;
4606
4607	startoffset = bf->l_start;
4608	fsize = ip->i_size;
4609
4610	/*
4611	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4612	 * file space.
4613	 * These calls do NOT zero the data space allocated to the file,
4614	 * nor do they change the file size.
4615	 *
4616	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4617	 * space.
4618	 * These calls cause the new file data to be zeroed and the file
4619	 * size to be changed.
4620	 */
4621	setprealloc = clrprealloc = 0;
4622
4623	switch (cmd) {
4624	case XFS_IOC_RESVSP:
4625	case XFS_IOC_RESVSP64:
4626		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4627								1, attr_flags);
4628		if (error)
4629			return error;
4630		setprealloc = 1;
4631		break;
4632
4633	case XFS_IOC_UNRESVSP:
4634	case XFS_IOC_UNRESVSP64:
4635		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4636								attr_flags)))
4637			return error;
4638		break;
4639
4640	case XFS_IOC_ALLOCSP:
4641	case XFS_IOC_ALLOCSP64:
4642	case XFS_IOC_FREESP:
4643	case XFS_IOC_FREESP64:
4644		if (startoffset > fsize) {
4645			error = xfs_alloc_file_space(ip, fsize,
4646					startoffset - fsize, 0, attr_flags);
4647			if (error)
4648				break;
4649		}
4650
4651		va.va_mask = XFS_AT_SIZE;
4652		va.va_size = startoffset;
4653
4654		error = xfs_setattr(bdp, &va, attr_flags, credp);
4655
4656		if (error)
4657			return error;
4658
4659		clrprealloc = 1;
4660		break;
4661
4662	default:
4663		ASSERT(0);
4664		return XFS_ERROR(EINVAL);
4665	}
4666
4667	/*
4668	 * update the inode timestamp, mode, and prealloc flag bits
4669	 */
4670	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4671
4672	if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4673				      0, 0, 0))) {
4674		/* ASSERT(0); */
4675		xfs_trans_cancel(tp, 0);
4676		return error;
4677	}
4678
4679	xfs_ilock(ip, XFS_ILOCK_EXCL);
4680
4681	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4682	xfs_trans_ihold(tp, ip);
4683
4684	if ((attr_flags & ATTR_DMI) == 0) {
4685		ip->i_d.di_mode &= ~S_ISUID;
4686
4687		/*
4688		 * Note that we don't have to worry about mandatory
4689		 * file locking being disabled here because we only
4690		 * clear the S_ISGID bit if the Group execute bit is
4691		 * on, but if it was on then mandatory locking wouldn't
4692		 * have been enabled.
4693		 */
4694		if (ip->i_d.di_mode & S_IXGRP)
4695			ip->i_d.di_mode &= ~S_ISGID;
4696
4697		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4698	}
4699	if (setprealloc)
4700		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4701	else if (clrprealloc)
4702		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4703
4704	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4705	xfs_trans_set_sync(tp);
4706
4707	error = xfs_trans_commit(tp, 0);
4708
4709	xfs_iunlock(ip, XFS_ILOCK_EXCL);
4710
4711	return error;
4712}
4713
/*
 * Vnode operations vector for XFS.
 *
 * Each .vop_* slot is bound to the routine that implements that operation.
 * Most are xfs_* routines; the page-cache slots and the two notification
 * slots are bound to generic fs_* routines instead.
 */
bhv_vnodeops_t xfs_vnodeops = {
	BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
	.vop_open		= xfs_open,
	.vop_read		= xfs_read,
	/* The splice entry points are only wired up when HAVE_SPLICE is defined. */
#ifdef HAVE_SPLICE
	.vop_splice_read	= xfs_splice_read,
	.vop_splice_write	= xfs_splice_write,
#endif
	.vop_write		= xfs_write,
	.vop_ioctl		= xfs_ioctl,
	.vop_getattr		= xfs_getattr,
	.vop_setattr		= xfs_setattr,
	.vop_access		= xfs_access,
	.vop_lookup		= xfs_lookup,
	.vop_create		= xfs_create,
	.vop_remove		= xfs_remove,
	.vop_link		= xfs_link,
	.vop_rename		= xfs_rename,
	.vop_mkdir		= xfs_mkdir,
	.vop_rmdir		= xfs_rmdir,
	.vop_readdir		= xfs_readdir,
	.vop_symlink		= xfs_symlink,
	.vop_readlink		= xfs_readlink,
	.vop_fsync		= xfs_fsync,
	.vop_inactive		= xfs_inactive,
	.vop_fid2		= xfs_fid2,
	.vop_rwlock		= xfs_rwlock,
	.vop_rwunlock		= xfs_rwunlock,
	.vop_bmap		= xfs_bmap,
	.vop_reclaim		= xfs_reclaim,
	.vop_attr_get		= xfs_attr_get,
	.vop_attr_set		= xfs_attr_set,
	.vop_attr_remove	= xfs_attr_remove,
	.vop_attr_list		= xfs_attr_list,
	/*
	 * Both notification slots share fs_noval, cast to each slot's
	 * function type (presumably a do-nothing stub -- confirm against
	 * its definition).
	 */
	.vop_link_removed	= (vop_link_removed_t)fs_noval,
	.vop_vnode_change	= (vop_vnode_change_t)fs_noval,
	/* Page-cache toss/flush operations use the generic fs_* routines. */
	.vop_tosspages		= fs_tosspages,
	.vop_flushinval_pages	= fs_flushinval_pages,
	.vop_flush_pages	= fs_flush_pages,
	.vop_release		= xfs_release,
	.vop_iflush		= xfs_inode_flush,
};
Configure Feed

Configure Feed