fs/ceph/file.c at v5.17 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / ceph / file.c
at v5.17 2596 lines 69 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2#include <linux/ceph/ceph_debug.h>
   3#include <linux/ceph/striper.h>
   4
   5#include <linux/module.h>
   6#include <linux/sched.h>
   7#include <linux/slab.h>
   8#include <linux/file.h>
   9#include <linux/mount.h>
  10#include <linux/namei.h>
  11#include <linux/writeback.h>
  12#include <linux/falloc.h>
  13#include <linux/iversion.h>
  14#include <linux/ktime.h>
  15
  16#include "super.h"
  17#include "mds_client.h"
  18#include "cache.h"
  19#include "io.h"
  20#include "metric.h"
  21
  22static __le32 ceph_flags_sys2wire(u32 flags)
  23{
  24	u32 wire_flags = 0;
  25
  26	switch (flags & O_ACCMODE) {
  27	case O_RDONLY:
  28		wire_flags |= CEPH_O_RDONLY;
  29		break;
  30	case O_WRONLY:
  31		wire_flags |= CEPH_O_WRONLY;
  32		break;
  33	case O_RDWR:
  34		wire_flags |= CEPH_O_RDWR;
  35		break;
  36	}
  37
  38	flags &= ~O_ACCMODE;
  39
  40#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
  41
  42	ceph_sys2wire(O_CREAT);
  43	ceph_sys2wire(O_EXCL);
  44	ceph_sys2wire(O_TRUNC);
  45	ceph_sys2wire(O_DIRECTORY);
  46	ceph_sys2wire(O_NOFOLLOW);
  47
  48#undef ceph_sys2wire
  49
  50	if (flags)
  51		dout("unused open flags: %x\n", flags);
  52
  53	return cpu_to_le32(wire_flags);
  54}
  55
  56/*
  57 * Ceph file operations
  58 *
  59 * Implement basic open/close functionality, and implement
  60 * read/write.
  61 *
  62 * We implement three modes of file I/O:
  63 *  - buffered uses the generic_file_aio_{read,write} helpers
  64 *
  65 *  - synchronous is used when there is multi-client read/write
  66 *    sharing, avoids the page cache, and synchronously waits for an
  67 *    ack from the OSD.
  68 *
  69 *  - direct io takes the variant of the sync path that references
  70 *    user pages directly.
  71 *
  72 * fsync() flushes and waits on dirty pages, but just queues metadata
  73 * for writeback: since the MDS can recover size and mtime there is no
  74 * need to wait for MDS acknowledgement.
  75 */
  76
/*
 * How many pages to request in one call to iov_iter_get_pages() from
 * __iter_get_bvecs().  This is also the length of the on-stack array of
 * struct page pointers used there as a buffer.
 */
#define ITER_GET_BVECS_PAGES	64
  82
  83static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
  84				struct bio_vec *bvecs)
  85{
  86	size_t size = 0;
  87	int bvec_idx = 0;
  88
  89	if (maxsize > iov_iter_count(iter))
  90		maxsize = iov_iter_count(iter);
  91
  92	while (size < maxsize) {
  93		struct page *pages[ITER_GET_BVECS_PAGES];
  94		ssize_t bytes;
  95		size_t start;
  96		int idx = 0;
  97
  98		bytes = iov_iter_get_pages(iter, pages, maxsize - size,
  99					   ITER_GET_BVECS_PAGES, &start);
 100		if (bytes < 0)
 101			return size ?: bytes;
 102
 103		iov_iter_advance(iter, bytes);
 104		size += bytes;
 105
 106		for ( ; bytes; idx++, bvec_idx++) {
 107			struct bio_vec bv = {
 108				.bv_page = pages[idx],
 109				.bv_len = min_t(int, bytes, PAGE_SIZE - start),
 110				.bv_offset = start,
 111			};
 112
 113			bvecs[bvec_idx] = bv;
 114			bytes -= bv.bv_len;
 115			start = 0;
 116		}
 117	}
 118
 119	return size;
 120}
 121
 122/*
 123 * iov_iter_get_pages() only considers one iov_iter segment, no matter
 124 * what maxsize or maxpages are given.  For ITER_BVEC that is a single
 125 * page.
 126 *
 127 * Attempt to get up to @maxsize bytes worth of pages from @iter.
 128 * Return the number of bytes in the created bio_vec array, or an error.
 129 */
 130static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
 131				    struct bio_vec **bvecs, int *num_bvecs)
 132{
 133	struct bio_vec *bv;
 134	size_t orig_count = iov_iter_count(iter);
 135	ssize_t bytes;
 136	int npages;
 137
 138	iov_iter_truncate(iter, maxsize);
 139	npages = iov_iter_npages(iter, INT_MAX);
 140	iov_iter_reexpand(iter, orig_count);
 141
 142	/*
 143	 * __iter_get_bvecs() may populate only part of the array -- zero it
 144	 * out.
 145	 */
 146	bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
 147	if (!bv)
 148		return -ENOMEM;
 149
 150	bytes = __iter_get_bvecs(iter, maxsize, bv);
 151	if (bytes < 0) {
 152		/*
 153		 * No pages were pinned -- just free the array.
 154		 */
 155		kvfree(bv);
 156		return bytes;
 157	}
 158
 159	*bvecs = bv;
 160	*num_bvecs = npages;
 161	return bytes;
 162}
 163
 164static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
 165{
 166	int i;
 167
 168	for (i = 0; i < num_bvecs; i++) {
 169		if (bvecs[i].bv_page) {
 170			if (should_dirty)
 171				set_page_dirty_lock(bvecs[i].bv_page);
 172			put_page(bvecs[i].bv_page);
 173		}
 174	}
 175	kvfree(bvecs);
 176}
 177
 178/*
 179 * Prepare an open request.  Preallocate ceph_cap to avoid an
 180 * inopportune ENOMEM later.
 181 */
 182static struct ceph_mds_request *
 183prepare_open_request(struct super_block *sb, int flags, int create_mode)
 184{
 185	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
 186	struct ceph_mds_request *req;
 187	int want_auth = USE_ANY_MDS;
 188	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
 189
 190	if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
 191		want_auth = USE_AUTH_MDS;
 192
 193	req = ceph_mdsc_create_request(mdsc, op, want_auth);
 194	if (IS_ERR(req))
 195		goto out;
 196	req->r_fmode = ceph_flags_to_mode(flags);
 197	req->r_args.open.flags = ceph_flags_sys2wire(flags);
 198	req->r_args.open.mode = cpu_to_le32(create_mode);
 199out:
 200	return req;
 201}
 202
 203static int ceph_init_file_info(struct inode *inode, struct file *file,
 204					int fmode, bool isdir)
 205{
 206	struct ceph_inode_info *ci = ceph_inode(inode);
 207	struct ceph_mount_options *opt =
 208		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
 209	struct ceph_file_info *fi;
 210
 211	dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
 212			inode->i_mode, isdir ? "dir" : "regular");
 213	BUG_ON(inode->i_fop->release != ceph_release);
 214
 215	if (isdir) {
 216		struct ceph_dir_file_info *dfi =
 217			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
 218		if (!dfi)
 219			return -ENOMEM;
 220
 221		file->private_data = dfi;
 222		fi = &dfi->file_info;
 223		dfi->next_offset = 2;
 224		dfi->readdir_cache_idx = -1;
 225	} else {
 226		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
 227		if (!fi)
 228			return -ENOMEM;
 229
 230		if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
 231			fi->flags |= CEPH_F_SYNC;
 232
 233		file->private_data = fi;
 234	}
 235
 236	ceph_get_fmode(ci, fmode, 1);
 237	fi->fmode = fmode;
 238
 239	spin_lock_init(&fi->rw_contexts_lock);
 240	INIT_LIST_HEAD(&fi->rw_contexts);
 241	fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
 242
 243	return 0;
 244}
 245
 246/*
 247 * initialize private struct file data.
 248 * if we fail, clean up by dropping fmode reference on the ceph_inode
 249 */
 250static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 251{
 252	int ret = 0;
 253
 254	switch (inode->i_mode & S_IFMT) {
 255	case S_IFREG:
 256		ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE);
 257		fallthrough;
 258	case S_IFDIR:
 259		ret = ceph_init_file_info(inode, file, fmode,
 260						S_ISDIR(inode->i_mode));
 261		break;
 262
 263	case S_IFLNK:
 264		dout("init_file %p %p 0%o (symlink)\n", inode, file,
 265		     inode->i_mode);
 266		break;
 267
 268	default:
 269		dout("init_file %p %p 0%o (special)\n", inode, file,
 270		     inode->i_mode);
 271		/*
 272		 * we need to drop the open ref now, since we don't
 273		 * have .release set to ceph_release.
 274		 */
 275		BUG_ON(inode->i_fop->release == ceph_release);
 276
 277		/* call the proper open fop */
 278		ret = inode->i_fop->open(inode, file);
 279	}
 280	return ret;
 281}
 282
 283/*
 284 * try renew caps after session gets killed.
 285 */
 286int ceph_renew_caps(struct inode *inode, int fmode)
 287{
 288	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 289	struct ceph_inode_info *ci = ceph_inode(inode);
 290	struct ceph_mds_request *req;
 291	int err, flags, wanted;
 292
 293	spin_lock(&ci->i_ceph_lock);
 294	__ceph_touch_fmode(ci, mdsc, fmode);
 295	wanted = __ceph_caps_file_wanted(ci);
 296	if (__ceph_is_any_real_caps(ci) &&
 297	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
 298		int issued = __ceph_caps_issued(ci, NULL);
 299		spin_unlock(&ci->i_ceph_lock);
 300		dout("renew caps %p want %s issued %s updating mds_wanted\n",
 301		     inode, ceph_cap_string(wanted), ceph_cap_string(issued));
 302		ceph_check_caps(ci, 0, NULL);
 303		return 0;
 304	}
 305	spin_unlock(&ci->i_ceph_lock);
 306
 307	flags = 0;
 308	if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
 309		flags = O_RDWR;
 310	else if (wanted & CEPH_CAP_FILE_RD)
 311		flags = O_RDONLY;
 312	else if (wanted & CEPH_CAP_FILE_WR)
 313		flags = O_WRONLY;
 314#ifdef O_LAZY
 315	if (wanted & CEPH_CAP_FILE_LAZYIO)
 316		flags |= O_LAZY;
 317#endif
 318
 319	req = prepare_open_request(inode->i_sb, flags, 0);
 320	if (IS_ERR(req)) {
 321		err = PTR_ERR(req);
 322		goto out;
 323	}
 324
 325	req->r_inode = inode;
 326	ihold(inode);
 327	req->r_num_caps = 1;
 328
 329	err = ceph_mdsc_do_request(mdsc, NULL, req);
 330	ceph_mdsc_put_request(req);
 331out:
 332	dout("renew caps %p open result=%d\n", inode, err);
 333	return err < 0 ? err : 0;
 334}
 335
/*
 * Open an inode.  Besides being an ->open() method, this function is
 * handed to finish_open() by the atomic open paths in this file.
 *
 * If we already have the requisite capabilities, we can satisfy
 * the open request locally (no need to request new caps from the
 * MDS).  We do, however, need to inform the MDS (asynchronously)
 * if our wanted caps set expands.  Without usable caps, an open
 * request is sent to the MDS and waited for.
 *
 * Returns 0 on success or a negative error code (-EROFS for a write
 * open of a snapshotted inode).
 */
int ceph_open(struct inode *inode, struct file *file)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct ceph_file_info *fi = file->private_data;
	int err;
	int flags, fmode, wanted;

	/* private data is already set up, so there is nothing left to do */
	if (fi) {
		dout("open file %p is already opened\n", file);
		return 0;
	}

	/* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
	flags = file->f_flags & ~(O_CREAT|O_EXCL);
	if (S_ISDIR(inode->i_mode))
		flags = O_DIRECTORY;  /* mds likes to know */

	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
	     ceph_vinop(inode), file, flags, file->f_flags);
	fmode = ceph_flags_to_mode(flags);
	wanted = ceph_caps_for_mode(fmode);

	/* snapped files are read-only */
	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
		return -EROFS;

	/* trivially open snapdir */
	if (ceph_snap(inode) == CEPH_SNAPDIR) {
		return ceph_init_file(inode, file, fmode);
	}

	/*
	 * No need to block if we have caps on the auth MDS (for
	 * write) or any MDS (for read).  Update wanted set
	 * asynchronously.
	 */
	spin_lock(&ci->i_ceph_lock);
	if (__ceph_is_any_real_caps(ci) &&
	    (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
		/* sample both cap sets while still holding i_ceph_lock */
		int mds_wanted = __ceph_caps_mds_wanted(ci, true);
		int issued = __ceph_caps_issued(ci, NULL);

		dout("open %p fmode %d want %s issued %s using existing\n",
		     inode, fmode, ceph_cap_string(wanted),
		     ceph_cap_string(issued));
		__ceph_touch_fmode(ci, mdsc, fmode);
		spin_unlock(&ci->i_ceph_lock);

		/*
		 * adjust wanted?  Only when the caps we want are neither
		 * all issued nor all already wanted from the MDS.
		 */
		if ((issued & wanted) != wanted &&
		    (mds_wanted & wanted) != wanted &&
		    ceph_snap(inode) != CEPH_SNAPDIR)
			ceph_check_caps(ci, 0, NULL);

		return ceph_init_file(inode, file, fmode);
	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
		   (ci->i_snap_caps & wanted) == wanted) {
		/* snapshot inode whose snap caps already cover what we want */
		__ceph_touch_fmode(ci, mdsc, fmode);
		spin_unlock(&ci->i_ceph_lock);
		return ceph_init_file(inode, file, fmode);
	}

	spin_unlock(&ci->i_ceph_lock);

	/* no usable caps: ask the MDS and wait for the reply */
	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
	req = prepare_open_request(inode->i_sb, flags, 0);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	/* inode reference for req->r_inode */
	req->r_inode = inode;
	ihold(inode);

	req->r_num_caps = 1;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (!err)
		err = ceph_init_file(inode, file, req->r_fmode);
	ceph_mdsc_put_request(req);
	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
out:
	return err;
}
 427
 428/* Clone the layout from a synchronous create, if the dir now has Dc caps */
 429static void
 430cache_file_layout(struct inode *dst, struct inode *src)
 431{
 432	struct ceph_inode_info *cdst = ceph_inode(dst);
 433	struct ceph_inode_info *csrc = ceph_inode(src);
 434
 435	spin_lock(&cdst->i_ceph_lock);
 436	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
 437	    !ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
 438		memcpy(&cdst->i_cached_layout, &csrc->i_layout,
 439			sizeof(cdst->i_cached_layout));
 440		rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
 441				   ceph_try_get_string(csrc->i_layout.pool_ns));
 442	}
 443	spin_unlock(&cdst->i_ceph_lock);
 444}
 445
/*
 * Try to set up an async create. We need caps, a file layout, and inode number,
 * and either a lease on the dentry or complete dir info. If any of those
 * criteria are not satisfied, then return 0 and the caller can go
 * synchronous.
 *
 * On success the wanted cap mask (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE)
 * is returned with cap references taken on it, *@pino holds a delegated
 * inode number, and *@lo holds a copy of the directory's cached layout.
 * The caller is expected to ceph_put_string() lo->pool_ns when done with
 * it, as ceph_atomic_open() does.
 */
static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
				 struct ceph_file_layout *lo, u64 *pino)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
	u64 ino;

	spin_lock(&ci->i_ceph_lock);
	/* No auth cap means no chance for Dc caps */
	if (!ci->i_auth_cap)
		goto no_async;

	/* Any delegated inos? */
	if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
		goto no_async;

	/* the new file's layout comes from the directory's cached layout */
	if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
		goto no_async;

	/* both wanted caps must currently be issued */
	if ((__ceph_caps_issued(ci, NULL) & want) != want)
		goto no_async;

	if (d_in_lookup(dentry)) {
		/* still being looked up: requires complete directory info */
		if (!__ceph_dir_is_complete(ci))
			goto no_async;
		spin_lock(&dentry->d_lock);
		di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
		spin_unlock(&dentry->d_lock);
	} else if (atomic_read(&ci->i_shared_gen) !=
		   READ_ONCE(di->lease_shared_gen)) {
		/* the dentry's recorded shared gen no longer matches the directory's */
		goto no_async;
	}

	/*
	 * Take the inode number last: every check that can bail out has
	 * already passed, so a number obtained here is always handed to the
	 * caller.
	 */
	ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
	if (!ino)
		goto no_async;

	*pino = ino;
	ceph_take_cap_refs(ci, want, false);
	memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
	rcu_assign_pointer(lo->pool_ns,
			   ceph_try_get_string(ci->i_cached_layout.pool_ns));
	got = want;
no_async:
	spin_unlock(&ci->i_ceph_lock);
	return got;
}
 500
 501static void restore_deleg_ino(struct inode *dir, u64 ino)
 502{
 503	struct ceph_inode_info *ci = ceph_inode(dir);
 504	struct ceph_mds_session *s = NULL;
 505
 506	spin_lock(&ci->i_ceph_lock);
 507	if (ci->i_auth_cap)
 508		s = ceph_get_mds_session(ci->i_auth_cap->session);
 509	spin_unlock(&ci->i_ceph_lock);
 510	if (s) {
 511		int err = ceph_restore_deleg_ino(s, ino);
 512		if (err)
 513			pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
 514				ino, err);
 515		ceph_put_mds_session(s);
 516	}
 517}
 518
 519static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
 520                                 struct ceph_mds_request *req)
 521{
 522	int result = req->r_err ? req->r_err :
 523			le32_to_cpu(req->r_reply_info.head->result);
 524
 525	if (result == -EJUKEBOX)
 526		goto out;
 527
 528	mapping_set_error(req->r_parent->i_mapping, result);
 529
 530	if (result) {
 531		struct dentry *dentry = req->r_dentry;
 532		struct inode *inode = d_inode(dentry);
 533		int pathlen = 0;
 534		u64 base = 0;
 535		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
 536						  &base, 0);
 537
 538		ceph_dir_clear_complete(req->r_parent);
 539		if (!d_unhashed(dentry))
 540			d_drop(dentry);
 541
 542		ceph_inode_shutdown(inode);
 543
 544		pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
 545			base, IS_ERR(path) ? "<<bad>>" : path, result);
 546		ceph_mdsc_free_path(path, pathlen);
 547	}
 548
 549	if (req->r_target_inode) {
 550		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
 551		u64 ino = ceph_vino(req->r_target_inode).ino;
 552
 553		if (req->r_deleg_ino != ino)
 554			pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
 555				__func__, req->r_err, req->r_deleg_ino, ino);
 556		mapping_set_error(req->r_target_inode->i_mapping, result);
 557
 558		spin_lock(&ci->i_ceph_lock);
 559		if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
 560			ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
 561			wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
 562		}
 563		ceph_kick_flushing_inode_caps(req->r_session, ci);
 564		spin_unlock(&ci->i_ceph_lock);
 565	} else if (!result) {
 566		pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
 567			req->r_deleg_ino);
 568	}
 569out:
 570	ceph_mdsc_release_dir_caps(req);
 571}
 572
/*
 * Finish an async create on the client side, without waiting for the
 * MDS reply.  ceph_atomic_open() calls this after the create request has
 * been submitted successfully.
 *
 * A ceph_mds_reply_inode is made up locally for the delegated inode
 * number in req->r_deleg_ino, the inode is filled from it with
 * ceph_fill_inode(), the inode is attached to @dentry, and the open is
 * completed with finish_open().
 *
 * @lo is the layout prepared by try_prep_async_create(); @as_ctx
 * carries the ACL/security context for the new inode.
 *
 * Returns 0 on success or a negative error code.
 */
static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
				    struct file *file, umode_t mode,
				    struct ceph_mds_request *req,
				    struct ceph_acl_sec_ctx *as_ctx,
				    struct ceph_file_layout *lo)
{
	int ret;
	char xattr_buf[4];
	struct ceph_mds_reply_inode in = { };
	struct ceph_mds_reply_info_in iinfo = { .in = &in };
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct inode *inode;
	struct timespec64 now;
	struct ceph_string *pool_ns;
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	struct ceph_vino vino = { .ino = req->r_deleg_ino,
				  .snap = CEPH_NOSNAP };

	ktime_get_real_ts64(&now);

	inode = ceph_get_inode(dentry->d_sb, vino);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	iinfo.inline_version = CEPH_INLINE_NONE;
	iinfo.change_attr = 1;
	ceph_encode_timespec64(&iinfo.btime, &now);

	/* a small all-zero xattr blob */
	iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
	iinfo.xattr_data = xattr_buf;
	memset(iinfo.xattr_data, 0, iinfo.xattr_len);

	/* fill in the inode record as it would have arrived in a reply */
	in.ino = cpu_to_le64(vino.ino);
	in.snapid = cpu_to_le64(CEPH_NOSNAP);
	in.version = cpu_to_le64(1);	// ???
	in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
	in.cap.cap_id = cpu_to_le64(1);
	in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
	in.cap.flags = CEPH_CAP_FLAG_AUTH;
	in.ctime = in.mtime = in.atime = iinfo.btime;
	in.truncate_seq = cpu_to_le32(1);
	in.truncate_size = cpu_to_le64(-1ULL);
	in.xattr_version = cpu_to_le64(1);
	in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
	if (dir->i_mode & S_ISGID) {
		/* setgid directory: the new inode takes the directory's group */
		in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));

		/* Directories always inherit the setgid bit. */
		if (S_ISDIR(mode))
			mode |= S_ISGID;
		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
			 !in_group_p(dir->i_gid) &&
			 !capable_wrt_inode_uidgid(&init_user_ns, dir, CAP_FSETID))
			mode &= ~S_ISGID;
	} else {
		in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
	}
	in.mode = cpu_to_le32((u32)mode);

	in.nlink = cpu_to_le32(1);
	in.max_size = cpu_to_le64(lo->stripe_unit);

	ceph_file_layout_to_legacy(lo, &in.layout);
	/* lo is private, so pool_ns can't change */
	pool_ns = rcu_dereference_raw(lo->pool_ns);
	if (pool_ns) {
		iinfo.pool_ns_len = pool_ns->len;
		iinfo.pool_ns_data = pool_ns->str;
	}

	/* ceph_fill_inode() runs with snap_rwsem held for read */
	down_read(&mdsc->snap_rwsem);
	ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
			      req->r_fmode, NULL);
	up_read(&mdsc->snap_rwsem);
	if (ret) {
		/* undo: the dentry stays without an inode, the new inode is dropped */
		dout("%s failed to fill inode: %d\n", __func__, ret);
		ceph_dir_clear_complete(dir);
		if (!d_unhashed(dentry))
			d_drop(dentry);
		if (inode->i_state & I_NEW)
			discard_new_inode(inode);
	} else {
		struct dentry *dn;

		dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
			vino.ino, ceph_ino(dir), dentry->d_name.name);
		ceph_dir_clear_ordered(dir);
		ceph_init_inode_acls(inode, as_ctx);
		if (inode->i_state & I_NEW) {
			/*
			 * If it's not I_NEW, then someone created this before
			 * we got here. Assume the server is aware of it at
			 * that point and don't worry about setting
			 * CEPH_I_ASYNC_CREATE.
			 */
			ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
			unlock_new_inode(inode);
		}
		if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
			if (!d_unhashed(dentry))
				d_drop(dentry);
			dn = d_splice_alias(inode, dentry);
			/* a different dentry coming back is not expected here */
			WARN_ON_ONCE(dn && dn != dentry);
		}
		file->f_mode |= FMODE_CREATED;
		ret = finish_open(file, dentry, ceph_open);
	}
	return ret;
}
 682
/*
 * Do a lookup + open with a single request.  If we get a non-existent
 * file or symlink, hand the dentry back with finish_no_open() so the
 * VFS can retry.
 *
 * For O_CREAT with the ASYNC_DIROPS mount option, an async create is
 * attempted first (see try_prep_async_create()).  In that case the
 * request is only submitted and the new inode is set up locally by
 * ceph_finish_async_create().  Otherwise the request is sent and waited
 * for.
 *
 * Returns the result of finish_open()/finish_no_open() on success or a
 * negative error code.
 */
int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
		     struct file *file, unsigned flags, umode_t mode)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct dentry *dn;
	struct ceph_acl_sec_ctx as_ctx = {};
	bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
	int mask;
	int err;

	dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
	     dir, dentry, dentry,
	     d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);

	if (dentry->d_name.len > NAME_MAX)
		return -ENAMETOOLONG;

	if (flags & O_CREAT) {
		if (ceph_quota_is_max_files_exceeded(dir))
			return -EDQUOT;
		/* set up the ACL and security context for the new inode */
		err = ceph_pre_init_acls(dir, &mode, &as_ctx);
		if (err < 0)
			return err;
		err = ceph_security_init_secctx(dentry, mode, &as_ctx);
		if (err < 0)
			goto out_ctx;
	} else if (!d_in_lookup(dentry)) {
		/* If it's not being looked up, it's negative */
		return -ENOENT;
	}
retry:
	/* do the open */
	req = prepare_open_request(dir->i_sb, flags, mode);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out_ctx;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	if (ceph_security_xattr_wanted(dir))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.open.mask = cpu_to_le32(mask);
	/* directory reference for req->r_parent */
	req->r_parent = dir;
	ihold(dir);

	if (flags & O_CREAT) {
		struct ceph_file_layout lo;

		req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
		/* the request takes over the pagelist from as_ctx */
		if (as_ctx.pagelist) {
			req->r_pagelist = as_ctx.pagelist;
			as_ctx.pagelist = NULL;
		}
		if (try_async &&
		    (req->r_dir_caps =
		      try_prep_async_create(dir, dentry, &lo,
					    &req->r_deleg_ino))) {
			set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
			req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
			req->r_callback = ceph_async_create_cb;
			err = ceph_mdsc_submit_request(mdsc, dir, req);
			if (!err) {
				err = ceph_finish_async_create(dir, dentry,
							file, mode, req,
							&as_ctx, &lo);
			} else if (err == -EJUKEBOX) {
				/*
				 * Give the delegated ino back and fall back
				 * to a synchronous create.
				 *
				 * NOTE(review): as_ctx.pagelist was moved to
				 * the request dropped here and cleared above,
				 * so the retried request is built without a
				 * pagelist -- confirm whether that is
				 * intended.
				 */
				restore_deleg_ino(dir, req->r_deleg_ino);
				ceph_mdsc_put_request(req);
				try_async = false;
				ceph_put_string(rcu_dereference_raw(lo.pool_ns));
				goto retry;
			}
			/* done with the layout copy from try_prep_async_create() */
			ceph_put_string(rcu_dereference_raw(lo.pool_ns));
			goto out_req;
		}
	}

	/* synchronous path */
	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	err = ceph_mdsc_do_request(mdsc,
				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
				   req);
	if (err == -ENOENT) {
		/* -ENOENT is not fatal if ceph_handle_snapdir() yields a dentry */
		dentry = ceph_handle_snapdir(req, dentry);
		if (IS_ERR(dentry)) {
			err = PTR_ERR(dentry);
			goto out_req;
		}
		err = 0;
	}

	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);

	if (d_in_lookup(dentry)) {
		dn = ceph_finish_lookup(req, dentry, err);
		if (IS_ERR(dn))
			err = PTR_ERR(dn);
	} else {
		/* we were given a hashed negative dentry */
		dn = NULL;
	}
	if (err)
		goto out_req;
	if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
		/* make vfs retry on splice, ENOENT, or symlink */
		dout("atomic_open finish_no_open on dn %p\n", dn);
		err = finish_no_open(file, dn);
	} else {
		dout("atomic_open finish_open on dn %p\n", dn);
		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
			/* the reply reports a newly created inode */
			struct inode *newino = d_inode(dentry);

			cache_file_layout(dir, newino);
			ceph_init_inode_acls(newino, &as_ctx);
			file->f_mode |= FMODE_CREATED;
		}
		err = finish_open(file, dentry, ceph_open);
	}
out_req:
	ceph_mdsc_put_request(req);
out_ctx:
	ceph_release_acl_sec_ctx(&as_ctx);
	dout("atomic_open result=%d\n", err);
	return err;
}
 816
 817int ceph_release(struct inode *inode, struct file *file)
 818{
 819	struct ceph_inode_info *ci = ceph_inode(inode);
 820
 821	if (S_ISDIR(inode->i_mode)) {
 822		struct ceph_dir_file_info *dfi = file->private_data;
 823		dout("release inode %p dir file %p\n", inode, file);
 824		WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
 825
 826		ceph_put_fmode(ci, dfi->file_info.fmode, 1);
 827
 828		if (dfi->last_readdir)
 829			ceph_mdsc_put_request(dfi->last_readdir);
 830		kfree(dfi->last_name);
 831		kfree(dfi->dir_info);
 832		kmem_cache_free(ceph_dir_file_cachep, dfi);
 833	} else {
 834		struct ceph_file_info *fi = file->private_data;
 835		dout("release inode %p regular file %p\n", inode, file);
 836		WARN_ON(!list_empty(&fi->rw_contexts));
 837
 838		ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
 839		ceph_put_fmode(ci, fi->fmode, 1);
 840
 841		kmem_cache_free(ceph_file_cachep, fi);
 842	}
 843
 844	/* wake up anyone waiting for caps on this inode */
 845	wake_up_all(&ci->i_cap_wq);
 846	return 0;
 847}
 848
/*
 * Values stored through the retry_op out-parameter of ceph_sync_read().
 * Of these, only CHECK_EOF is set in the code visible here.
 */
enum {
	/* NOTE(review): presumably "a retry was already done" -- confirm at the users */
	HAVE_RETRIED = 1,
	/* set by ceph_sync_read() when the read offset reached or passed i_size */
	CHECK_EOF =    2,
	/* NOTE(review): presumably "read the inline data" -- confirm at the users */
	READ_INLINE =  3,
};
 854
/*
 * Completely synchronous read and write methods.  Direct from __user
 * buffer to osd, or directly to user pages (if O_DIRECT).
 *
 * If the read spans object boundary, just do multiple reads.  (That's not
 * atomic, but good enough for now.)
 *
 * If we get a short result from the OSD, check against i_size; we need to
 * only return a short read to the caller if we hit EOF.
 *
 * ceph_sync_read() reads from iocb->ki_pos into @to, one OSD request at
 * a time, through a temporary page vector.  If any data was copied, the
 * number of bytes is returned and iocb->ki_pos is advanced.  When the
 * read got as far as i_size, *@retry_op is set to CHECK_EOF and the
 * result is clamped to i_size.  If nothing was copied, the last status
 * (0 or a negative error) is returned.
 */
static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
			      int *retry_op)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	ssize_t ret;
	u64 off = iocb->ki_pos;
	u64 len = iov_iter_count(to);
	u64 i_size = i_size_read(inode);

	dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");

	if (!len)
		return 0;
	/*
	 * flush any page cache pages in this range.  this
	 * will make concurrent normal and sync io slow,
	 * but it will at least behave sensibly when they are
	 * in sequence.
	 */
	ret = filemap_write_and_wait_range(inode->i_mapping,
					   off, off + len - 1);
	if (ret < 0)
		return ret;

	ret = 0;
	while ((len = iov_iter_count(to)) > 0) {
		struct ceph_osd_request *req;
		struct page **pages;
		int num_pages;
		size_t page_off;
		bool more;
		int idx;
		size_t left;

		/* len is passed by pointer and may come back smaller */
		req = ceph_osdc_new_request(osdc, &ci->i_layout,
					ci->i_vino, off, &len, 0, 1,
					CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
					NULL, ci->i_truncate_seq,
					ci->i_truncate_size, false);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

		/* does the iterator want more than this one request covers? */
		more = len < iov_iter_count(to);

		num_pages = calc_pages_for(off, len);
		page_off = off & ~PAGE_MASK;
		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
		if (IS_ERR(pages)) {
			ceph_osdc_put_request(req);
			ret = PTR_ERR(pages);
			break;
		}

		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
						 false, false);
		ret = ceph_osdc_start_request(osdc, req, false);
		if (!ret)
			ret = ceph_osdc_wait_request(osdc, req);

		ceph_update_read_metrics(&fsc->mdsc->metric,
					 req->r_start_latency,
					 req->r_end_latency,
					 len, ret);

		ceph_osdc_put_request(req);

		/* i_size may have changed while we were waiting */
		i_size = i_size_read(inode);
		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
		     off, len, ret, i_size, (more ? " MORE" : ""));

		/* -ENOENT counts as zero bytes read */
		if (ret == -ENOENT)
			ret = 0;
		/* short read that ends before i_size: zero-fill the gap */
		if (ret >= 0 && ret < len && (off + ret < i_size)) {
			int zlen = min(len - ret, i_size - off - ret);
			int zoff = page_off + ret;
			dout("sync_read zero gap %llu~%llu\n",
                             off + ret, off + ret + zlen);
			ceph_zero_page_vector_range(zoff, zlen, pages);
			ret += zlen;
		}

		/* copy what we have, page by page, to the caller's iterator */
		idx = 0;
		left = ret > 0 ? ret : 0;
		while (left > 0) {
			/* note: this len shadows the outer u64 len */
			size_t len, copied;
			page_off = off & ~PAGE_MASK;
			len = min_t(size_t, left, PAGE_SIZE - page_off);
			SetPageUptodate(pages[idx]);
			copied = copy_page_to_iter(pages[idx++],
						   page_off, len, to);
			off += copied;
			left -= copied;
			if (copied < len) {
				ret = -EFAULT;
				break;
			}
		}
		ceph_release_page_vector(pages, num_pages);

		if (ret < 0) {
			if (ret == -EBLOCKLISTED)
				fsc->blocklisted = true;
			break;
		}

		if (off >= i_size || !more)
			break;
	}

	/*
	 * Some data was copied: report the byte count rather than any error
	 * left in ret.
	 */
	if (off > iocb->ki_pos) {
		if (off >= i_size) {
			*retry_op = CHECK_EOF;
			ret = i_size - iocb->ki_pos;
			iocb->ki_pos = i_size;
		} else {
			ret = off - iocb->ki_pos;
			iocb->ki_pos = off;
		}
	}

	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
	return ret;
}
 995
/*
 * State shared by the OSD requests that make up one asynchronous I/O.
 * It is freed by ceph_aio_complete() once the last pending request is
 * done.
 */
struct ceph_aio_request {
	/* the iocb whose ->ki_complete() is called at the end */
	struct kiocb *iocb;
	/* result handed to ->ki_complete() when error is 0 */
	size_t total_len;
	/* write rather than read: selects the completion work and the cap ref dropped */
	bool write;
	/* NOTE(review): presumably passed to put_bvecs() -- not used in the code visible here */
	bool should_dirty;
	/* if non-zero, handed to ->ki_complete() instead of total_len */
	int error;
	/* NOTE(review): presumably the OSD requests belonging to this I/O -- confirm */
	struct list_head osd_reqs;
	/* NOTE(review): presumably the number of entries on osd_reqs -- confirm */
	unsigned num_reqs;
	/* ceph_aio_complete() only finishes the I/O when this drops to zero */
	atomic_t pending_reqs;
	/* NOTE(review): presumably the mtime to use for writes -- confirm */
	struct timespec64 mtime;
	/* passed to __ceph_mark_dirty_caps() and freed on completion */
	struct ceph_cap_flush *prealloc_cf;
};
1008
/*
 * Work item queued by ceph_aio_complete_req() when an OSD request
 * finished with -EOLDSNAPC; it runs ceph_aio_retry_work().
 */
struct ceph_aio_work {
	struct work_struct work;
	/* the OSD request that finished with -EOLDSNAPC */
	struct ceph_osd_request *req;
};
1013
1014static void ceph_aio_retry_work(struct work_struct *work);
1015
/*
 * Drop one pending request from @aio_req and, if it was the last one,
 * complete the whole I/O.
 *
 * Completion ends direct I/O accounting for IOCB_DIRECT iocbs.  For a
 * write with a non-negative result it also grows the inode size if
 * needed and marks the FILE_WR cap dirty.  It then drops the cap
 * reference (FILE_WR for writes, FILE_RD for reads), calls the iocb's
 * ->ki_complete() with the result, and frees @aio_req.
 */
static void ceph_aio_complete(struct inode *inode,
			      struct ceph_aio_request *aio_req)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int ret;

	/* other requests are still outstanding */
	if (!atomic_dec_and_test(&aio_req->pending_reqs))
		return;

	if (aio_req->iocb->ki_flags & IOCB_DIRECT)
		inode_dio_end(inode);

	/* a recorded error takes precedence over the byte count */
	ret = aio_req->error;
	if (!ret)
		ret = aio_req->total_len;

	dout("ceph_aio_complete %p rc %d\n", inode, ret);

	if (ret >= 0 && aio_req->write) {
		int dirty;

		/* grow i_size if the write ended past it */
		loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
		if (endoff > i_size_read(inode)) {
			if (ceph_inode_set_size(inode, endoff))
				ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
		}

		spin_lock(&ci->i_ceph_lock);
		ci->i_inline_version = CEPH_INLINE_NONE;
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
					       &aio_req->prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
		if (dirty)
			__mark_inode_dirty(inode, dirty);

	}

	ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
						CEPH_CAP_FILE_RD));

	aio_req->iocb->ki_complete(aio_req->iocb, ret);

	/* free whatever is left of the preallocated cap flush, then the request */
	ceph_free_cap_flush(aio_req->prealloc_cf);
	kfree(aio_req);
}
1061
1062static void ceph_aio_complete_req(struct ceph_osd_request *req)
1063{
1064	int rc = req->r_result;
1065	struct inode *inode = req->r_inode;
1066	struct ceph_aio_request *aio_req = req->r_priv;
1067	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
1068	struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
1069	unsigned int len = osd_data->bvec_pos.iter.bi_size;
1070
1071	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
1072	BUG_ON(!osd_data->num_bvecs);
1073
1074	dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len);
1075
1076	if (rc == -EOLDSNAPC) {
1077		struct ceph_aio_work *aio_work;
1078		BUG_ON(!aio_req->write);
1079
1080		aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
1081		if (aio_work) {
1082			INIT_WORK(&aio_work->work, ceph_aio_retry_work);
1083			aio_work->req = req;
1084			queue_work(ceph_inode_to_client(inode)->inode_wq,
1085				   &aio_work->work);
1086			return;
1087		}
1088		rc = -ENOMEM;
1089	} else if (!aio_req->write) {
1090		if (rc == -ENOENT)
1091			rc = 0;
1092		if (rc >= 0 && len > rc) {
1093			struct iov_iter i;
1094			int zlen = len - rc;
1095
1096			/*
1097			 * If read is satisfied by single OSD request,
1098			 * it can pass EOF. Otherwise read is within
1099			 * i_size.
1100			 */
1101			if (aio_req->num_reqs == 1) {
1102				loff_t i_size = i_size_read(inode);
1103				loff_t endoff = aio_req->iocb->ki_pos + rc;
1104				if (endoff < i_size)
1105					zlen = min_t(size_t, zlen,
1106						     i_size - endoff);
1107				aio_req->total_len = rc + zlen;
1108			}
1109
1110			iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
1111				      osd_data->num_bvecs, len);
1112			iov_iter_advance(&i, rc);
1113			iov_iter_zero(zlen, &i);
1114		}
1115	}
1116
1117	/* r_start_latency == 0 means the request was not submitted */
1118	if (req->r_start_latency) {
1119		if (aio_req->write)
1120			ceph_update_write_metrics(metric, req->r_start_latency,
1121						  req->r_end_latency, len, rc);
1122		else
1123			ceph_update_read_metrics(metric, req->r_start_latency,
1124						 req->r_end_latency, len, rc);
1125	}
1126
1127	put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
1128		  aio_req->should_dirty);
1129	ceph_osdc_put_request(req);
1130
1131	if (rc < 0)
1132		cmpxchg(&aio_req->error, 0, rc);
1133
1134	ceph_aio_complete(inode, aio_req);
1135	return;
1136}
1137
1138static void ceph_aio_retry_work(struct work_struct *work)
1139{
1140	struct ceph_aio_work *aio_work =
1141		container_of(work, struct ceph_aio_work, work);
1142	struct ceph_osd_request *orig_req = aio_work->req;
1143	struct ceph_aio_request *aio_req = orig_req->r_priv;
1144	struct inode *inode = orig_req->r_inode;
1145	struct ceph_inode_info *ci = ceph_inode(inode);
1146	struct ceph_snap_context *snapc;
1147	struct ceph_osd_request *req;
1148	int ret;
1149
1150	spin_lock(&ci->i_ceph_lock);
1151	if (__ceph_have_pending_cap_snap(ci)) {
1152		struct ceph_cap_snap *capsnap =
1153			list_last_entry(&ci->i_cap_snaps,
1154					struct ceph_cap_snap,
1155					ci_item);
1156		snapc = ceph_get_snap_context(capsnap->context);
1157	} else {
1158		BUG_ON(!ci->i_head_snapc);
1159		snapc = ceph_get_snap_context(ci->i_head_snapc);
1160	}
1161	spin_unlock(&ci->i_ceph_lock);
1162
1163	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1,
1164			false, GFP_NOFS);
1165	if (!req) {
1166		ret = -ENOMEM;
1167		req = orig_req;
1168		goto out;
1169	}
1170
1171	req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
1172	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
1173	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
1174
1175	req->r_ops[0] = orig_req->r_ops[0];
1176
1177	req->r_mtime = aio_req->mtime;
1178	req->r_data_offset = req->r_ops[0].extent.offset;
1179
1180	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
1181	if (ret) {
1182		ceph_osdc_put_request(req);
1183		req = orig_req;
1184		goto out;
1185	}
1186
1187	ceph_osdc_put_request(orig_req);
1188
1189	req->r_callback = ceph_aio_complete_req;
1190	req->r_inode = inode;
1191	req->r_priv = aio_req;
1192
1193	ret = ceph_osdc_start_request(req->r_osdc, req, false);
1194out:
1195	if (ret < 0) {
1196		req->r_result = ret;
1197		ceph_aio_complete_req(req);
1198	}
1199
1200	ceph_put_snap_context(snapc);
1201	kfree(aio_work);
1202}
1203
1204static ssize_t
1205ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
1206		       struct ceph_snap_context *snapc,
1207		       struct ceph_cap_flush **pcf)
1208{
1209	struct file *file = iocb->ki_filp;
1210	struct inode *inode = file_inode(file);
1211	struct ceph_inode_info *ci = ceph_inode(inode);
1212	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1213	struct ceph_client_metric *metric = &fsc->mdsc->metric;
1214	struct ceph_vino vino;
1215	struct ceph_osd_request *req;
1216	struct bio_vec *bvecs;
1217	struct ceph_aio_request *aio_req = NULL;
1218	int num_pages = 0;
1219	int flags;
1220	int ret = 0;
1221	struct timespec64 mtime = current_time(inode);
1222	size_t count = iov_iter_count(iter);
1223	loff_t pos = iocb->ki_pos;
1224	bool write = iov_iter_rw(iter) == WRITE;
1225	bool should_dirty = !write && iter_is_iovec(iter);
1226
1227	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
1228		return -EROFS;
1229
1230	dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
1231	     (write ? "write" : "read"), file, pos, (unsigned)count,
1232	     snapc, snapc ? snapc->seq : 0);
1233
1234	if (write) {
1235		int ret2;
1236
1237		ceph_fscache_invalidate(inode, true);
1238
1239		ret2 = invalidate_inode_pages2_range(inode->i_mapping,
1240					pos >> PAGE_SHIFT,
1241					(pos + count - 1) >> PAGE_SHIFT);
1242		if (ret2 < 0)
1243			dout("invalidate_inode_pages2_range returned %d\n", ret2);
1244
1245		flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
1246	} else {
1247		flags = CEPH_OSD_FLAG_READ;
1248	}
1249
1250	while (iov_iter_count(iter) > 0) {
1251		u64 size = iov_iter_count(iter);
1252		ssize_t len;
1253
1254		if (write)
1255			size = min_t(u64, size, fsc->mount_options->wsize);
1256		else
1257			size = min_t(u64, size, fsc->mount_options->rsize);
1258
1259		vino = ceph_vino(inode);
1260		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
1261					    vino, pos, &size, 0,
1262					    1,
1263					    write ? CEPH_OSD_OP_WRITE :
1264						    CEPH_OSD_OP_READ,
1265					    flags, snapc,
1266					    ci->i_truncate_seq,
1267					    ci->i_truncate_size,
1268					    false);
1269		if (IS_ERR(req)) {
1270			ret = PTR_ERR(req);
1271			break;
1272		}
1273
1274		len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
1275		if (len < 0) {
1276			ceph_osdc_put_request(req);
1277			ret = len;
1278			break;
1279		}
1280		if (len != size)
1281			osd_req_op_extent_update(req, 0, len);
1282
1283		/*
1284		 * To simplify error handling, allow AIO when IO within i_size
1285		 * or IO can be satisfied by single OSD request.
1286		 */
1287		if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
1288		    (len == count || pos + count <= i_size_read(inode))) {
1289			aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
1290			if (aio_req) {
1291				aio_req->iocb = iocb;
1292				aio_req->write = write;
1293				aio_req->should_dirty = should_dirty;
1294				INIT_LIST_HEAD(&aio_req->osd_reqs);
1295				if (write) {
1296					aio_req->mtime = mtime;
1297					swap(aio_req->prealloc_cf, *pcf);
1298				}
1299			}
1300			/* ignore error */
1301		}
1302
1303		if (write) {
1304			/*
1305			 * throw out any page cache pages in this range. this
1306			 * may block.
1307			 */
1308			truncate_inode_pages_range(inode->i_mapping, pos,
1309						   PAGE_ALIGN(pos + len) - 1);
1310
1311			req->r_mtime = mtime;
1312		}
1313
1314		osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
1315
1316		if (aio_req) {
1317			aio_req->total_len += len;
1318			aio_req->num_reqs++;
1319			atomic_inc(&aio_req->pending_reqs);
1320
1321			req->r_callback = ceph_aio_complete_req;
1322			req->r_inode = inode;
1323			req->r_priv = aio_req;
1324			list_add_tail(&req->r_private_item, &aio_req->osd_reqs);
1325
1326			pos += len;
1327			continue;
1328		}
1329
1330		ret = ceph_osdc_start_request(req->r_osdc, req, false);
1331		if (!ret)
1332			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
1333
1334		if (write)
1335			ceph_update_write_metrics(metric, req->r_start_latency,
1336						  req->r_end_latency, len, ret);
1337		else
1338			ceph_update_read_metrics(metric, req->r_start_latency,
1339						 req->r_end_latency, len, ret);
1340
1341		size = i_size_read(inode);
1342		if (!write) {
1343			if (ret == -ENOENT)
1344				ret = 0;
1345			if (ret >= 0 && ret < len && pos + ret < size) {
1346				struct iov_iter i;
1347				int zlen = min_t(size_t, len - ret,
1348						 size - pos - ret);
1349
1350				iov_iter_bvec(&i, READ, bvecs, num_pages, len);
1351				iov_iter_advance(&i, ret);
1352				iov_iter_zero(zlen, &i);
1353				ret += zlen;
1354			}
1355			if (ret >= 0)
1356				len = ret;
1357		}
1358
1359		put_bvecs(bvecs, num_pages, should_dirty);
1360		ceph_osdc_put_request(req);
1361		if (ret < 0)
1362			break;
1363
1364		pos += len;
1365		if (!write && pos >= size)
1366			break;
1367
1368		if (write && pos > size) {
1369			if (ceph_inode_set_size(inode, pos))
1370				ceph_check_caps(ceph_inode(inode),
1371						CHECK_CAPS_AUTHONLY,
1372						NULL);
1373		}
1374	}
1375
1376	if (aio_req) {
1377		LIST_HEAD(osd_reqs);
1378
1379		if (aio_req->num_reqs == 0) {
1380			kfree(aio_req);
1381			return ret;
1382		}
1383
1384		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
1385					      CEPH_CAP_FILE_RD);
1386
1387		list_splice(&aio_req->osd_reqs, &osd_reqs);
1388		inode_dio_begin(inode);
1389		while (!list_empty(&osd_reqs)) {
1390			req = list_first_entry(&osd_reqs,
1391					       struct ceph_osd_request,
1392					       r_private_item);
1393			list_del_init(&req->r_private_item);
1394			if (ret >= 0)
1395				ret = ceph_osdc_start_request(req->r_osdc,
1396							      req, false);
1397			if (ret < 0) {
1398				req->r_result = ret;
1399				ceph_aio_complete_req(req);
1400			}
1401		}
1402		return -EIOCBQUEUED;
1403	}
1404
1405	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
1406		ret = pos - iocb->ki_pos;
1407		iocb->ki_pos = pos;
1408	}
1409	return ret;
1410}
1411
1412/*
1413 * Synchronous write, straight from __user pointer or user pages.
1414 *
1415 * If write spans object boundary, just do multiple writes.  (For a
1416 * correct atomic write, we should e.g. take write locks on all
1417 * objects, rollback on failure, etc.)
1418 */
1419static ssize_t
1420ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
1421		struct ceph_snap_context *snapc)
1422{
1423	struct file *file = iocb->ki_filp;
1424	struct inode *inode = file_inode(file);
1425	struct ceph_inode_info *ci = ceph_inode(inode);
1426	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1427	struct ceph_vino vino;
1428	struct ceph_osd_request *req;
1429	struct page **pages;
1430	u64 len;
1431	int num_pages;
1432	int written = 0;
1433	int flags;
1434	int ret;
1435	bool check_caps = false;
1436	struct timespec64 mtime = current_time(inode);
1437	size_t count = iov_iter_count(from);
1438
1439	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
1440		return -EROFS;
1441
1442	dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
1443	     file, pos, (unsigned)count, snapc, snapc->seq);
1444
1445	ret = filemap_write_and_wait_range(inode->i_mapping,
1446					   pos, pos + count - 1);
1447	if (ret < 0)
1448		return ret;
1449
1450	ceph_fscache_invalidate(inode, false);
1451	ret = invalidate_inode_pages2_range(inode->i_mapping,
1452					    pos >> PAGE_SHIFT,
1453					    (pos + count - 1) >> PAGE_SHIFT);
1454	if (ret < 0)
1455		dout("invalidate_inode_pages2_range returned %d\n", ret);
1456
1457	flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
1458
1459	while ((len = iov_iter_count(from)) > 0) {
1460		size_t left;
1461		int n;
1462
1463		vino = ceph_vino(inode);
1464		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
1465					    vino, pos, &len, 0, 1,
1466					    CEPH_OSD_OP_WRITE, flags, snapc,
1467					    ci->i_truncate_seq,
1468					    ci->i_truncate_size,
1469					    false);
1470		if (IS_ERR(req)) {
1471			ret = PTR_ERR(req);
1472			break;
1473		}
1474
1475		/*
1476		 * write from beginning of first page,
1477		 * regardless of io alignment
1478		 */
1479		num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1480
1481		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1482		if (IS_ERR(pages)) {
1483			ret = PTR_ERR(pages);
1484			goto out;
1485		}
1486
1487		left = len;
1488		for (n = 0; n < num_pages; n++) {
1489			size_t plen = min_t(size_t, left, PAGE_SIZE);
1490			ret = copy_page_from_iter(pages[n], 0, plen, from);
1491			if (ret != plen) {
1492				ret = -EFAULT;
1493				break;
1494			}
1495			left -= ret;
1496		}
1497
1498		if (ret < 0) {
1499			ceph_release_page_vector(pages, num_pages);
1500			goto out;
1501		}
1502
1503		req->r_inode = inode;
1504
1505		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
1506						false, true);
1507
1508		req->r_mtime = mtime;
1509		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1510		if (!ret)
1511			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
1512
1513		ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
1514					  req->r_end_latency, len, ret);
1515out:
1516		ceph_osdc_put_request(req);
1517		if (ret != 0) {
1518			ceph_set_error_write(ci);
1519			break;
1520		}
1521
1522		ceph_clear_error_write(ci);
1523		pos += len;
1524		written += len;
1525		if (pos > i_size_read(inode)) {
1526			check_caps = ceph_inode_set_size(inode, pos);
1527			if (check_caps)
1528				ceph_check_caps(ceph_inode(inode),
1529						CHECK_CAPS_AUTHONLY,
1530						NULL);
1531		}
1532
1533	}
1534
1535	if (ret != -EOLDSNAPC && written > 0) {
1536		ret = written;
1537		iocb->ki_pos = pos;
1538	}
1539	return ret;
1540}
1541
1542/*
1543 * Wrap generic_file_aio_read with checks for cap bits on the inode.
1544 * Atomically grab references, so that those bits are not released
1545 * back to the MDS mid-read.
1546 *
1547 * Hmm, the sync read case isn't actually async... should it be?
1548 */
1549static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
1550{
1551	struct file *filp = iocb->ki_filp;
1552	struct ceph_file_info *fi = filp->private_data;
1553	size_t len = iov_iter_count(to);
1554	struct inode *inode = file_inode(filp);
1555	struct ceph_inode_info *ci = ceph_inode(inode);
1556	bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
1557	ssize_t ret;
1558	int want = 0, got = 0;
1559	int retry_op = 0, read = 0;
1560
1561again:
1562	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
1563	     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
1564
1565	if (ceph_inode_is_shutdown(inode))
1566		return -ESTALE;
1567
1568	if (direct_lock)
1569		ceph_start_io_direct(inode);
1570	else
1571		ceph_start_io_read(inode);
1572
1573	if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
1574		want |= CEPH_CAP_FILE_CACHE;
1575	if (fi->fmode & CEPH_FILE_MODE_LAZY)
1576		want |= CEPH_CAP_FILE_LAZYIO;
1577
1578	ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
1579	if (ret < 0) {
1580		if (direct_lock)
1581			ceph_end_io_direct(inode);
1582		else
1583			ceph_end_io_read(inode);
1584		return ret;
1585	}
1586
1587	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
1588	    (iocb->ki_flags & IOCB_DIRECT) ||
1589	    (fi->flags & CEPH_F_SYNC)) {
1590
1591		dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
1592		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
1593		     ceph_cap_string(got));
1594
1595		if (ci->i_inline_version == CEPH_INLINE_NONE) {
1596			if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
1597				ret = ceph_direct_read_write(iocb, to,
1598							     NULL, NULL);
1599				if (ret >= 0 && ret < len)
1600					retry_op = CHECK_EOF;
1601			} else {
1602				ret = ceph_sync_read(iocb, to, &retry_op);
1603			}
1604		} else {
1605			retry_op = READ_INLINE;
1606		}
1607	} else {
1608		CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
1609		dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
1610		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
1611		     ceph_cap_string(got));
1612		ceph_add_rw_context(fi, &rw_ctx);
1613		ret = generic_file_read_iter(iocb, to);
1614		ceph_del_rw_context(fi, &rw_ctx);
1615	}
1616
1617	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
1618	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
1619	ceph_put_cap_refs(ci, got);
1620
1621	if (direct_lock)
1622		ceph_end_io_direct(inode);
1623	else
1624		ceph_end_io_read(inode);
1625
1626	if (retry_op > HAVE_RETRIED && ret >= 0) {
1627		int statret;
1628		struct page *page = NULL;
1629		loff_t i_size;
1630		if (retry_op == READ_INLINE) {
1631			page = __page_cache_alloc(GFP_KERNEL);
1632			if (!page)
1633				return -ENOMEM;
1634		}
1635
1636		statret = __ceph_do_getattr(inode, page,
1637					    CEPH_STAT_CAP_INLINE_DATA, !!page);
1638		if (statret < 0) {
1639			if (page)
1640				__free_page(page);
1641			if (statret == -ENODATA) {
1642				BUG_ON(retry_op != READ_INLINE);
1643				goto again;
1644			}
1645			return statret;
1646		}
1647
1648		i_size = i_size_read(inode);
1649		if (retry_op == READ_INLINE) {
1650			BUG_ON(ret > 0 || read > 0);
1651			if (iocb->ki_pos < i_size &&
1652			    iocb->ki_pos < PAGE_SIZE) {
1653				loff_t end = min_t(loff_t, i_size,
1654						   iocb->ki_pos + len);
1655				end = min_t(loff_t, end, PAGE_SIZE);
1656				if (statret < end)
1657					zero_user_segment(page, statret, end);
1658				ret = copy_page_to_iter(page,
1659						iocb->ki_pos & ~PAGE_MASK,
1660						end - iocb->ki_pos, to);
1661				iocb->ki_pos += ret;
1662				read += ret;
1663			}
1664			if (iocb->ki_pos < i_size && read < len) {
1665				size_t zlen = min_t(size_t, len - read,
1666						    i_size - iocb->ki_pos);
1667				ret = iov_iter_zero(zlen, to);
1668				iocb->ki_pos += ret;
1669				read += ret;
1670			}
1671			__free_pages(page, 0);
1672			return read;
1673		}
1674
1675		/* hit EOF or hole? */
1676		if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
1677		    ret < len) {
1678			dout("sync_read hit hole, ppos %lld < size %lld"
1679			     ", reading more\n", iocb->ki_pos, i_size);
1680
1681			read += ret;
1682			len -= ret;
1683			retry_op = HAVE_RETRIED;
1684			goto again;
1685		}
1686	}
1687
1688	if (ret >= 0)
1689		ret += read;
1690
1691	return ret;
1692}
1693
1694/*
1695 * Take cap references to avoid releasing caps to MDS mid-write.
1696 *
1697 * If we are synchronous, and write with an old snap context, the OSD
1698 * may return EOLDSNAPC.  In that case, retry the write.. _after_
1699 * dropping our cap refs and allowing the pending snap to logically
1700 * complete _before_ this write occurs.
1701 *
1702 * If we are near ENOSPC, write synchronously.
1703 */
1704static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
1705{
1706	struct file *file = iocb->ki_filp;
1707	struct ceph_file_info *fi = file->private_data;
1708	struct inode *inode = file_inode(file);
1709	struct ceph_inode_info *ci = ceph_inode(inode);
1710	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1711	struct ceph_osd_client *osdc = &fsc->client->osdc;
1712	struct ceph_cap_flush *prealloc_cf;
1713	ssize_t count, written = 0;
1714	int err, want = 0, got;
1715	bool direct_lock = false;
1716	u32 map_flags;
1717	u64 pool_flags;
1718	loff_t pos;
1719	loff_t limit = max(i_size_read(inode), fsc->max_file_size);
1720
1721	if (ceph_inode_is_shutdown(inode))
1722		return -ESTALE;
1723
1724	if (ceph_snap(inode) != CEPH_NOSNAP)
1725		return -EROFS;
1726
1727	prealloc_cf = ceph_alloc_cap_flush();
1728	if (!prealloc_cf)
1729		return -ENOMEM;
1730
1731	if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT)
1732		direct_lock = true;
1733
1734retry_snap:
1735	if (direct_lock)
1736		ceph_start_io_direct(inode);
1737	else
1738		ceph_start_io_write(inode);
1739
1740	/* We can write back this queue in page reclaim */
1741	current->backing_dev_info = inode_to_bdi(inode);
1742
1743	if (iocb->ki_flags & IOCB_APPEND) {
1744		err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
1745		if (err < 0)
1746			goto out;
1747	}
1748
1749	err = generic_write_checks(iocb, from);
1750	if (err <= 0)
1751		goto out;
1752
1753	pos = iocb->ki_pos;
1754	if (unlikely(pos >= limit)) {
1755		err = -EFBIG;
1756		goto out;
1757	} else {
1758		iov_iter_truncate(from, limit - pos);
1759	}
1760
1761	count = iov_iter_count(from);
1762	if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) {
1763		err = -EDQUOT;
1764		goto out;
1765	}
1766
1767	down_read(&osdc->lock);
1768	map_flags = osdc->osdmap->flags;
1769	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
1770	up_read(&osdc->lock);
1771	if ((map_flags & CEPH_OSDMAP_FULL) ||
1772	    (pool_flags & CEPH_POOL_FLAG_FULL)) {
1773		err = -ENOSPC;
1774		goto out;
1775	}
1776
1777	err = file_remove_privs(file);
1778	if (err)
1779		goto out;
1780
1781	if (ci->i_inline_version != CEPH_INLINE_NONE) {
1782		err = ceph_uninline_data(file, NULL);
1783		if (err < 0)
1784			goto out;
1785	}
1786
1787	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
1788	     inode, ceph_vinop(inode), pos, count, i_size_read(inode));
1789	if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
1790		want |= CEPH_CAP_FILE_BUFFER;
1791	if (fi->fmode & CEPH_FILE_MODE_LAZY)
1792		want |= CEPH_CAP_FILE_LAZYIO;
1793	got = 0;
1794	err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
1795	if (err < 0)
1796		goto out;
1797
1798	err = file_update_time(file);
1799	if (err)
1800		goto out_caps;
1801
1802	inode_inc_iversion_raw(inode);
1803
1804	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
1805	     inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
1806
1807	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
1808	    (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
1809	    (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
1810		struct ceph_snap_context *snapc;
1811		struct iov_iter data;
1812
1813		spin_lock(&ci->i_ceph_lock);
1814		if (__ceph_have_pending_cap_snap(ci)) {
1815			struct ceph_cap_snap *capsnap =
1816					list_last_entry(&ci->i_cap_snaps,
1817							struct ceph_cap_snap,
1818							ci_item);
1819			snapc = ceph_get_snap_context(capsnap->context);
1820		} else {
1821			BUG_ON(!ci->i_head_snapc);
1822			snapc = ceph_get_snap_context(ci->i_head_snapc);
1823		}
1824		spin_unlock(&ci->i_ceph_lock);
1825
1826		/* we might need to revert back to that point */
1827		data = *from;
1828		if (iocb->ki_flags & IOCB_DIRECT)
1829			written = ceph_direct_read_write(iocb, &data, snapc,
1830							 &prealloc_cf);
1831		else
1832			written = ceph_sync_write(iocb, &data, pos, snapc);
1833		if (direct_lock)
1834			ceph_end_io_direct(inode);
1835		else
1836			ceph_end_io_write(inode);
1837		if (written > 0)
1838			iov_iter_advance(from, written);
1839		ceph_put_snap_context(snapc);
1840	} else {
1841		/*
1842		 * No need to acquire the i_truncate_mutex. Because
1843		 * the MDS revokes Fwb caps before sending truncate
1844		 * message to us. We can't get Fwb cap while there
1845		 * are pending vmtruncate. So write and vmtruncate
1846		 * can not run at the same time
1847		 */
1848		written = generic_perform_write(file, from, pos);
1849		if (likely(written >= 0))
1850			iocb->ki_pos = pos + written;
1851		ceph_end_io_write(inode);
1852	}
1853
1854	if (written >= 0) {
1855		int dirty;
1856
1857		spin_lock(&ci->i_ceph_lock);
1858		ci->i_inline_version = CEPH_INLINE_NONE;
1859		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
1860					       &prealloc_cf);
1861		spin_unlock(&ci->i_ceph_lock);
1862		if (dirty)
1863			__mark_inode_dirty(inode, dirty);
1864		if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
1865			ceph_check_caps(ci, 0, NULL);
1866	}
1867
1868	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
1869	     inode, ceph_vinop(inode), pos, (unsigned)count,
1870	     ceph_cap_string(got));
1871	ceph_put_cap_refs(ci, got);
1872
1873	if (written == -EOLDSNAPC) {
1874		dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
1875		     inode, ceph_vinop(inode), pos, (unsigned)count);
1876		goto retry_snap;
1877	}
1878
1879	if (written >= 0) {
1880		if ((map_flags & CEPH_OSDMAP_NEARFULL) ||
1881		    (pool_flags & CEPH_POOL_FLAG_NEARFULL))
1882			iocb->ki_flags |= IOCB_DSYNC;
1883		written = generic_write_sync(iocb, written);
1884	}
1885
1886	goto out_unlocked;
1887out_caps:
1888	ceph_put_cap_refs(ci, got);
1889out:
1890	if (direct_lock)
1891		ceph_end_io_direct(inode);
1892	else
1893		ceph_end_io_write(inode);
1894out_unlocked:
1895	ceph_free_cap_flush(prealloc_cf);
1896	current->backing_dev_info = NULL;
1897	return written ? written : err;
1898}
1899
1900/*
1901 * llseek.  be sure to verify file size on SEEK_END.
1902 */
1903static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
1904{
1905	struct inode *inode = file->f_mapping->host;
1906	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1907	loff_t i_size;
1908	loff_t ret;
1909
1910	inode_lock(inode);
1911
1912	if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
1913		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
1914		if (ret < 0)
1915			goto out;
1916	}
1917
1918	i_size = i_size_read(inode);
1919	switch (whence) {
1920	case SEEK_END:
1921		offset += i_size;
1922		break;
1923	case SEEK_CUR:
1924		/*
1925		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
1926		 * position-querying operation.  Avoid rewriting the "same"
1927		 * f_pos value back to the file because a concurrent read(),
1928		 * write() or lseek() might have altered it
1929		 */
1930		if (offset == 0) {
1931			ret = file->f_pos;
1932			goto out;
1933		}
1934		offset += file->f_pos;
1935		break;
1936	case SEEK_DATA:
1937		if (offset < 0 || offset >= i_size) {
1938			ret = -ENXIO;
1939			goto out;
1940		}
1941		break;
1942	case SEEK_HOLE:
1943		if (offset < 0 || offset >= i_size) {
1944			ret = -ENXIO;
1945			goto out;
1946		}
1947		offset = i_size;
1948		break;
1949	}
1950
1951	ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size));
1952
1953out:
1954	inode_unlock(inode);
1955	return ret;
1956}
1957
1958static inline void ceph_zero_partial_page(
1959	struct inode *inode, loff_t offset, unsigned size)
1960{
1961	struct page *page;
1962	pgoff_t index = offset >> PAGE_SHIFT;
1963
1964	page = find_lock_page(inode->i_mapping, index);
1965	if (page) {
1966		wait_on_page_writeback(page);
1967		zero_user(page, offset & (PAGE_SIZE - 1), size);
1968		unlock_page(page);
1969		put_page(page);
1970	}
1971}
1972
1973static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
1974				      loff_t length)
1975{
1976	loff_t nearly = round_up(offset, PAGE_SIZE);
1977	if (offset < nearly) {
1978		loff_t size = nearly - offset;
1979		if (length < size)
1980			size = length;
1981		ceph_zero_partial_page(inode, offset, size);
1982		offset += size;
1983		length -= size;
1984	}
1985	if (length >= PAGE_SIZE) {
1986		loff_t size = round_down(length, PAGE_SIZE);
1987		truncate_pagecache_range(inode, offset, offset + size - 1);
1988		offset += size;
1989		length -= size;
1990	}
1991	if (length)
1992		ceph_zero_partial_page(inode, offset, length);
1993}
1994
1995static int ceph_zero_partial_object(struct inode *inode,
1996				    loff_t offset, loff_t *length)
1997{
1998	struct ceph_inode_info *ci = ceph_inode(inode);
1999	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2000	struct ceph_osd_request *req;
2001	int ret = 0;
2002	loff_t zero = 0;
2003	int op;
2004
2005	if (!length) {
2006		op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
2007		length = &zero;
2008	} else {
2009		op = CEPH_OSD_OP_ZERO;
2010	}
2011
2012	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
2013					ceph_vino(inode),
2014					offset, length,
2015					0, 1, op,
2016					CEPH_OSD_FLAG_WRITE,
2017					NULL, 0, 0, false);
2018	if (IS_ERR(req)) {
2019		ret = PTR_ERR(req);
2020		goto out;
2021	}
2022
2023	req->r_mtime = inode->i_mtime;
2024	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
2025	if (!ret) {
2026		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
2027		if (ret == -ENOENT)
2028			ret = 0;
2029	}
2030	ceph_osdc_put_request(req);
2031
2032out:
2033	return ret;
2034}
2035
2036static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
2037{
2038	int ret = 0;
2039	struct ceph_inode_info *ci = ceph_inode(inode);
2040	s32 stripe_unit = ci->i_layout.stripe_unit;
2041	s32 stripe_count = ci->i_layout.stripe_count;
2042	s32 object_size = ci->i_layout.object_size;
2043	u64 object_set_size = object_size * stripe_count;
2044	u64 nearly, t;
2045
2046	/* round offset up to next period boundary */
2047	nearly = offset + object_set_size - 1;
2048	t = nearly;
2049	nearly -= do_div(t, object_set_size);
2050
2051	while (length && offset < nearly) {
2052		loff_t size = length;
2053		ret = ceph_zero_partial_object(inode, offset, &size);
2054		if (ret < 0)
2055			return ret;
2056		offset += size;
2057		length -= size;
2058	}
2059	while (length >= object_set_size) {
2060		int i;
2061		loff_t pos = offset;
2062		for (i = 0; i < stripe_count; ++i) {
2063			ret = ceph_zero_partial_object(inode, pos, NULL);
2064			if (ret < 0)
2065				return ret;
2066			pos += stripe_unit;
2067		}
2068		offset += object_set_size;
2069		length -= object_set_size;
2070	}
2071	while (length) {
2072		loff_t size = length;
2073		ret = ceph_zero_partial_object(inode, offset, &size);
2074		if (ret < 0)
2075			return ret;
2076		offset += size;
2077		length -= size;
2078	}
2079	return ret;
2080}
2081
2082static long ceph_fallocate(struct file *file, int mode,
2083				loff_t offset, loff_t length)
2084{
2085	struct ceph_file_info *fi = file->private_data;
2086	struct inode *inode = file_inode(file);
2087	struct ceph_inode_info *ci = ceph_inode(inode);
2088	struct ceph_cap_flush *prealloc_cf;
2089	int want, got = 0;
2090	int dirty;
2091	int ret = 0;
2092	loff_t endoff = 0;
2093	loff_t size;
2094
2095	if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2096		return -EOPNOTSUPP;
2097
2098	if (!S_ISREG(inode->i_mode))
2099		return -EOPNOTSUPP;
2100
2101	prealloc_cf = ceph_alloc_cap_flush();
2102	if (!prealloc_cf)
2103		return -ENOMEM;
2104
2105	inode_lock(inode);
2106
2107	if (ceph_snap(inode) != CEPH_NOSNAP) {
2108		ret = -EROFS;
2109		goto unlock;
2110	}
2111
2112	if (ci->i_inline_version != CEPH_INLINE_NONE) {
2113		ret = ceph_uninline_data(file, NULL);
2114		if (ret < 0)
2115			goto unlock;
2116	}
2117
2118	size = i_size_read(inode);
2119
2120	/* Are we punching a hole beyond EOF? */
2121	if (offset >= size)
2122		goto unlock;
2123	if ((offset + length) > size)
2124		length = size - offset;
2125
2126	if (fi->fmode & CEPH_FILE_MODE_LAZY)
2127		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
2128	else
2129		want = CEPH_CAP_FILE_BUFFER;
2130
2131	ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got);
2132	if (ret < 0)
2133		goto unlock;
2134
2135	filemap_invalidate_lock(inode->i_mapping);
2136	ceph_fscache_invalidate(inode, false);
2137	ceph_zero_pagecache_range(inode, offset, length);
2138	ret = ceph_zero_objects(inode, offset, length);
2139
2140	if (!ret) {
2141		spin_lock(&ci->i_ceph_lock);
2142		ci->i_inline_version = CEPH_INLINE_NONE;
2143		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
2144					       &prealloc_cf);
2145		spin_unlock(&ci->i_ceph_lock);
2146		if (dirty)
2147			__mark_inode_dirty(inode, dirty);
2148	}
2149	filemap_invalidate_unlock(inode->i_mapping);
2150
2151	ceph_put_cap_refs(ci, got);
2152unlock:
2153	inode_unlock(inode);
2154	ceph_free_cap_flush(prealloc_cf);
2155	return ret;
2156}
2157
2158/*
2159 * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
2160 * src_ci.  Two attempts are made to obtain both caps, and an error is return if
2161 * this fails; zero is returned on success.
2162 */
2163static int get_rd_wr_caps(struct file *src_filp, int *src_got,
2164			  struct file *dst_filp,
2165			  loff_t dst_endoff, int *dst_got)
2166{
2167	int ret = 0;
2168	bool retrying = false;
2169
2170retry_caps:
2171	ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
2172			    dst_endoff, dst_got);
2173	if (ret < 0)
2174		return ret;
2175
2176	/*
2177	 * Since we're already holding the FILE_WR capability for the dst file,
2178	 * we would risk a deadlock by using ceph_get_caps.  Thus, we'll do some
2179	 * retry dance instead to try to get both capabilities.
2180	 */
2181	ret = ceph_try_get_caps(file_inode(src_filp),
2182				CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
2183				false, src_got);
2184	if (ret <= 0) {
2185		/* Start by dropping dst_ci caps and getting src_ci caps */
2186		ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got);
2187		if (retrying) {
2188			if (!ret)
2189				/* ceph_try_get_caps masks EAGAIN */
2190				ret = -EAGAIN;
2191			return ret;
2192		}
2193		ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
2194				    CEPH_CAP_FILE_SHARED, -1, src_got);
2195		if (ret < 0)
2196			return ret;
2197		/*... drop src_ci caps too, and retry */
2198		ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got);
2199		retrying = true;
2200		goto retry_caps;
2201	}
2202	return ret;
2203}
2204
/*
 * Drop the cap references described by src_got on the source inode and by
 * dst_got on the destination inode (source first, then destination).
 */
static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
			   struct ceph_inode_info *dst_ci, int dst_got)
{
	ceph_put_cap_refs(src_ci, src_got);
	ceph_put_cap_refs(dst_ci, dst_got);
}
2211
2212/*
2213 * This function does several size-related checks, returning an error if:
2214 *  - source file is smaller than off+len
2215 *  - destination file size is not OK (inode_newsize_ok())
2216 *  - max bytes quotas is exceeded
2217 */
2218static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
2219			   loff_t src_off, loff_t dst_off, size_t len)
2220{
2221	loff_t size, endoff;
2222
2223	size = i_size_read(src_inode);
2224	/*
2225	 * Don't copy beyond source file EOF.  Instead of simply setting length
2226	 * to (size - src_off), just drop to VFS default implementation, as the
2227	 * local i_size may be stale due to other clients writing to the source
2228	 * inode.
2229	 */
2230	if (src_off + len > size) {
2231		dout("Copy beyond EOF (%llu + %zu > %llu)\n",
2232		     src_off, len, size);
2233		return -EOPNOTSUPP;
2234	}
2235	size = i_size_read(dst_inode);
2236
2237	endoff = dst_off + len;
2238	if (inode_newsize_ok(dst_inode, endoff))
2239		return -EOPNOTSUPP;
2240
2241	if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff))
2242		return -EDQUOT;
2243
2244	return 0;
2245}
2246
2247static struct ceph_osd_request *
2248ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc,
2249			    u64 src_snapid,
2250			    struct ceph_object_id *src_oid,
2251			    struct ceph_object_locator *src_oloc,
2252			    struct ceph_object_id *dst_oid,
2253			    struct ceph_object_locator *dst_oloc,
2254			    u32 truncate_seq, u64 truncate_size)
2255{
2256	struct ceph_osd_request *req;
2257	int ret;
2258	u32 src_fadvise_flags =
2259		CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
2260		CEPH_OSD_OP_FLAG_FADVISE_NOCACHE;
2261	u32 dst_fadvise_flags =
2262		CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
2263		CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
2264
2265	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
2266	if (!req)
2267		return ERR_PTR(-ENOMEM);
2268
2269	req->r_flags = CEPH_OSD_FLAG_WRITE;
2270
2271	ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
2272	ceph_oid_copy(&req->r_t.base_oid, dst_oid);
2273
2274	ret = osd_req_op_copy_from_init(req, src_snapid, 0,
2275					src_oid, src_oloc,
2276					src_fadvise_flags,
2277					dst_fadvise_flags,
2278					truncate_seq,
2279					truncate_size,
2280					CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
2281	if (ret)
2282		goto out;
2283
2284	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
2285	if (ret)
2286		goto out;
2287
2288	return req;
2289
2290out:
2291	ceph_osdc_put_request(req);
2292	return ERR_PTR(ret);
2293}
2294
2295static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
2296				    struct ceph_inode_info *dst_ci, u64 *dst_off,
2297				    struct ceph_fs_client *fsc,
2298				    size_t len, unsigned int flags)
2299{
2300	struct ceph_object_locator src_oloc, dst_oloc;
2301	struct ceph_object_id src_oid, dst_oid;
2302	struct ceph_osd_client *osdc;
2303	struct ceph_osd_request *req;
2304	size_t bytes = 0;
2305	u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
2306	u32 src_objlen, dst_objlen;
2307	u32 object_size = src_ci->i_layout.object_size;
2308	int ret;
2309
2310	src_oloc.pool = src_ci->i_layout.pool_id;
2311	src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
2312	dst_oloc.pool = dst_ci->i_layout.pool_id;
2313	dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
2314	osdc = &fsc->client->osdc;
2315
2316	while (len >= object_size) {
2317		ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
2318					      object_size, &src_objnum,
2319					      &src_objoff, &src_objlen);
2320		ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
2321					      object_size, &dst_objnum,
2322					      &dst_objoff, &dst_objlen);
2323		ceph_oid_init(&src_oid);
2324		ceph_oid_printf(&src_oid, "%llx.%08llx",
2325				src_ci->i_vino.ino, src_objnum);
2326		ceph_oid_init(&dst_oid);
2327		ceph_oid_printf(&dst_oid, "%llx.%08llx",
2328				dst_ci->i_vino.ino, dst_objnum);
2329		/* Do an object remote copy */
2330		req = ceph_alloc_copyfrom_request(osdc, src_ci->i_vino.snap,
2331						  &src_oid, &src_oloc,
2332						  &dst_oid, &dst_oloc,
2333						  dst_ci->i_truncate_seq,
2334						  dst_ci->i_truncate_size);
2335		if (IS_ERR(req))
2336			ret = PTR_ERR(req);
2337		else {
2338			ceph_osdc_start_request(osdc, req, false);
2339			ret = ceph_osdc_wait_request(osdc, req);
2340			ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
2341						     req->r_start_latency,
2342						     req->r_end_latency,
2343						     object_size, ret);
2344			ceph_osdc_put_request(req);
2345		}
2346		if (ret) {
2347			if (ret == -EOPNOTSUPP) {
2348				fsc->have_copy_from2 = false;
2349				pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
2350			}
2351			dout("ceph_osdc_copy_from returned %d\n", ret);
2352			if (!bytes)
2353				bytes = ret;
2354			goto out;
2355		}
2356		len -= object_size;
2357		bytes += object_size;
2358		*src_off += object_size;
2359		*dst_off += object_size;
2360	}
2361
2362out:
2363	ceph_oloc_destroy(&src_oloc);
2364	ceph_oloc_destroy(&dst_oloc);
2365	return bytes;
2366}
2367
2368static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
2369				      struct file *dst_file, loff_t dst_off,
2370				      size_t len, unsigned int flags)
2371{
2372	struct inode *src_inode = file_inode(src_file);
2373	struct inode *dst_inode = file_inode(dst_file);
2374	struct ceph_inode_info *src_ci = ceph_inode(src_inode);
2375	struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
2376	struct ceph_cap_flush *prealloc_cf;
2377	struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
2378	loff_t size;
2379	ssize_t ret = -EIO, bytes;
2380	u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
2381	u32 src_objlen, dst_objlen;
2382	int src_got = 0, dst_got = 0, err, dirty;
2383
2384	if (src_inode->i_sb != dst_inode->i_sb) {
2385		struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
2386
2387		if (ceph_fsid_compare(&src_fsc->client->fsid,
2388				      &dst_fsc->client->fsid)) {
2389			dout("Copying files across clusters: src: %pU dst: %pU\n",
2390			     &src_fsc->client->fsid, &dst_fsc->client->fsid);
2391			return -EXDEV;
2392		}
2393	}
2394	if (ceph_snap(dst_inode) != CEPH_NOSNAP)
2395		return -EROFS;
2396
2397	/*
2398	 * Some of the checks below will return -EOPNOTSUPP, which will force a
2399	 * fallback to the default VFS copy_file_range implementation.  This is
2400	 * desirable in several cases (for ex, the 'len' is smaller than the
2401	 * size of the objects, or in cases where that would be more
2402	 * efficient).
2403	 */
2404
2405	if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
2406		return -EOPNOTSUPP;
2407
2408	if (!src_fsc->have_copy_from2)
2409		return -EOPNOTSUPP;
2410
2411	/*
2412	 * Striped file layouts require that we copy partial objects, but the
2413	 * OSD copy-from operation only supports full-object copies.  Limit
2414	 * this to non-striped file layouts for now.
2415	 */
2416	if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
2417	    (src_ci->i_layout.stripe_count != 1) ||
2418	    (dst_ci->i_layout.stripe_count != 1) ||
2419	    (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
2420		dout("Invalid src/dst files layout\n");
2421		return -EOPNOTSUPP;
2422	}
2423
2424	if (len < src_ci->i_layout.object_size)
2425		return -EOPNOTSUPP; /* no remote copy will be done */
2426
2427	prealloc_cf = ceph_alloc_cap_flush();
2428	if (!prealloc_cf)
2429		return -ENOMEM;
2430
2431	/* Start by sync'ing the source and destination files */
2432	ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
2433	if (ret < 0) {
2434		dout("failed to write src file (%zd)\n", ret);
2435		goto out;
2436	}
2437	ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
2438	if (ret < 0) {
2439		dout("failed to write dst file (%zd)\n", ret);
2440		goto out;
2441	}
2442
2443	/*
2444	 * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
2445	 * clients may have dirty data in their caches.  And OSDs know nothing
2446	 * about caps, so they can't safely do the remote object copies.
2447	 */
2448	err = get_rd_wr_caps(src_file, &src_got,
2449			     dst_file, (dst_off + len), &dst_got);
2450	if (err < 0) {
2451		dout("get_rd_wr_caps returned %d\n", err);
2452		ret = -EOPNOTSUPP;
2453		goto out;
2454	}
2455
2456	ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
2457	if (ret < 0)
2458		goto out_caps;
2459
2460	/* Drop dst file cached pages */
2461	ceph_fscache_invalidate(dst_inode, false);
2462	ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
2463					    dst_off >> PAGE_SHIFT,
2464					    (dst_off + len) >> PAGE_SHIFT);
2465	if (ret < 0) {
2466		dout("Failed to invalidate inode pages (%zd)\n", ret);
2467		ret = 0; /* XXX */
2468	}
2469	ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
2470				      src_ci->i_layout.object_size,
2471				      &src_objnum, &src_objoff, &src_objlen);
2472	ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
2473				      dst_ci->i_layout.object_size,
2474				      &dst_objnum, &dst_objoff, &dst_objlen);
2475	/* object-level offsets need to the same */
2476	if (src_objoff != dst_objoff) {
2477		ret = -EOPNOTSUPP;
2478		goto out_caps;
2479	}
2480
2481	/*
2482	 * Do a manual copy if the object offset isn't object aligned.
2483	 * 'src_objlen' contains the bytes left until the end of the object,
2484	 * starting at the src_off
2485	 */
2486	if (src_objoff) {
2487		dout("Initial partial copy of %u bytes\n", src_objlen);
2488
2489		/*
2490		 * we need to temporarily drop all caps as we'll be calling
2491		 * {read,write}_iter, which will get caps again.
2492		 */
2493		put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
2494		ret = do_splice_direct(src_file, &src_off, dst_file,
2495				       &dst_off, src_objlen, flags);
2496		/* Abort on short copies or on error */
2497		if (ret < src_objlen) {
2498			dout("Failed partial copy (%zd)\n", ret);
2499			goto out;
2500		}
2501		len -= ret;
2502		err = get_rd_wr_caps(src_file, &src_got,
2503				     dst_file, (dst_off + len), &dst_got);
2504		if (err < 0)
2505			goto out;
2506		err = is_file_size_ok(src_inode, dst_inode,
2507				      src_off, dst_off, len);
2508		if (err < 0)
2509			goto out_caps;
2510	}
2511
2512	size = i_size_read(dst_inode);
2513	bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
2514				     src_fsc, len, flags);
2515	if (bytes <= 0) {
2516		if (!ret)
2517			ret = bytes;
2518		goto out_caps;
2519	}
2520	dout("Copied %zu bytes out of %zu\n", bytes, len);
2521	len -= bytes;
2522	ret += bytes;
2523
2524	file_update_time(dst_file);
2525	inode_inc_iversion_raw(dst_inode);
2526
2527	if (dst_off > size) {
2528		/* Let the MDS know about dst file size change */
2529		if (ceph_inode_set_size(dst_inode, dst_off) ||
2530		    ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
2531			ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
2532	}
2533	/* Mark Fw dirty */
2534	spin_lock(&dst_ci->i_ceph_lock);
2535	dst_ci->i_inline_version = CEPH_INLINE_NONE;
2536	dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
2537	spin_unlock(&dst_ci->i_ceph_lock);
2538	if (dirty)
2539		__mark_inode_dirty(dst_inode, dirty);
2540
2541out_caps:
2542	put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
2543
2544	/*
2545	 * Do the final manual copy if we still have some bytes left, unless
2546	 * there were errors in remote object copies (len >= object_size).
2547	 */
2548	if (len && (len < src_ci->i_layout.object_size)) {
2549		dout("Final partial copy of %zu bytes\n", len);
2550		bytes = do_splice_direct(src_file, &src_off, dst_file,
2551					 &dst_off, len, flags);
2552		if (bytes > 0)
2553			ret += bytes;
2554		else
2555			dout("Failed partial copy (%zd)\n", bytes);
2556	}
2557
2558out:
2559	ceph_free_cap_flush(prealloc_cf);
2560
2561	return ret;
2562}
2563
2564static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
2565				    struct file *dst_file, loff_t dst_off,
2566				    size_t len, unsigned int flags)
2567{
2568	ssize_t ret;
2569
2570	ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
2571				     len, flags);
2572
2573	if (ret == -EOPNOTSUPP || ret == -EXDEV)
2574		ret = generic_copy_file_range(src_file, src_off, dst_file,
2575					      dst_off, len, flags);
2576	return ret;
2577}
2578
2579const struct file_operations ceph_file_fops = {
2580	.open = ceph_open,
2581	.release = ceph_release,
2582	.llseek = ceph_llseek,
2583	.read_iter = ceph_read_iter,
2584	.write_iter = ceph_write_iter,
2585	.mmap = ceph_mmap,
2586	.fsync = ceph_fsync,
2587	.lock = ceph_lock,
2588	.setlease = simple_nosetlease,
2589	.flock = ceph_flock,
2590	.splice_read = generic_file_splice_read,
2591	.splice_write = iter_file_splice_write,
2592	.unlocked_ioctl = ceph_ioctl,
2593	.compat_ioctl = compat_ptr_ioctl,
2594	.fallocate	= ceph_fallocate,
2595	.copy_file_range = ceph_copy_file_range,
2596};
Configure Feed

Configure Feed