// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/signal.h>
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/netfs.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "metric.h"
#include "crypto.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>

/*
 * Ceph address space ops.
 *
 * There are a few funny things going on here.
 *
 * The page->private field is used to reference a struct
 * ceph_snap_context for _every_ dirty page.  This indicates which
 * snapshot the page was logically dirtied in, and thus which snap
 * context needs to be associated with the osd write during writeback.
 *
 * Similarly, struct ceph_inode_info maintains a set of counters to
 * count dirty pages on the inode.  In the absence of snapshots,
 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
 *
 * When a snapshot is taken (that is, when the client receives
 * notification that a snapshot was taken), each inode with caps and
 * with dirty pages (dirty pages implies there is a cap) gets a new
 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
 * order, new snaps go to the tail).  The i_wrbuffer_ref_head count is
 * moved to capsnap->dirty. (Unless a sync write is currently in
 * progress.  In that case, the capsnap is said to be "pending", new
 * writes cannot start, and the capsnap isn't "finalized" until the
 * write completes (or fails) and a final size/mtime for the inode for
 * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
 *
 * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
 * we look for the first capsnap in i_cap_snaps and write out pages in
 * that snap context _only_.  Then we move on to the next capsnap,
 * eventually reaching the "live" or "head" context (i.e., pages that
 * are not yet snapped) and are writing the most recently dirtied
 * pages.
 *
 * Invalidate and so forth must take care to ensure the dirty page
 * accounting is preserved.
 */
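/*
 * Worked example (illustrative, not part of the original comment):
 * dirty pages A and B under the current head snapc, then take a
 * snapshot, then dirty page C.  A and B keep a reference to the old
 * head snapc in page->private and are counted in capsnap->dirty;
 * C is counted in i_wrbuffer_ref_head against the new head snapc.
 * Writeback must flush A and B before it is allowed to write C.
 */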

#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
#define CONGESTION_OFF_THRESH(congestion_kb)				\
	(CONGESTION_ON_THRESH(congestion_kb) -				\
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
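/*
 * Worked numbers (illustrative): with 4 KiB pages (PAGE_SHIFT == 12)
 * and congestion_kb == 8192, the "on" threshold is 8192 >> 2 == 2048
 * pages under writeback, and the "off" threshold is 2048 - 512 == 1536,
 * i.e. congestion clears once we drop 25% below the point where it
 * was asserted.
 */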

static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
					struct folio **foliop, void **_fsdata);

static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
	if (PagePrivate(page))
		return (void *)page->private;
	return NULL;
}

/*
 * Dirty a page.  Optimistically adjust accounting, on the assumption
 * that we won't race with invalidate.  If we do, readjust.
 */
static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
	struct inode *inode = mapping->host;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci;
	struct ceph_snap_context *snapc;

	if (folio_test_dirty(folio)) {
		doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
		      ceph_vinop(inode), folio, folio->index);
		VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
		return false;
	}

	ci = ceph_inode(inode);

	/* dirty the head */
	spin_lock(&ci->i_ceph_lock);
	BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
	if (__ceph_have_pending_cap_snap(ci)) {
		struct ceph_cap_snap *capsnap =
				list_last_entry(&ci->i_cap_snaps,
						struct ceph_cap_snap,
						ci_item);
		snapc = ceph_get_snap_context(capsnap->context);
		capsnap->dirty_pages++;
	} else {
		BUG_ON(!ci->i_head_snapc);
		snapc = ceph_get_snap_context(ci->i_head_snapc);
		++ci->i_wrbuffer_ref_head;
	}
	if (ci->i_wrbuffer_ref == 0)
		ihold(inode);
	++ci->i_wrbuffer_ref;
	doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
	      "snapc %p seq %lld (%d snaps)\n",
	      ceph_vinop(inode), folio, folio->index,
	      ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
	      ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
	      snapc, snapc->seq, snapc->num_snaps);
	spin_unlock(&ci->i_ceph_lock);

	/*
	 * Reference snap context in folio->private.  Also set
	 * PagePrivate so that we get invalidate_folio callback.
	 */
	VM_WARN_ON_FOLIO(folio->private, folio);
	folio_attach_private(folio, snapc);

	return ceph_fscache_dirty_folio(mapping, folio);
}
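/*
 * Accounting sketch (editorial): dirtying a folio always bumps
 * i_wrbuffer_ref, plus either capsnap->dirty_pages (when a capsnap is
 * still being finalized) or i_wrbuffer_ref_head (the normal case).
 * The first dirty folio also pins the inode via ihold(); the pin is
 * dropped again once ceph_put_wrbuffer_cap_refs() brings the count
 * back to zero.
 */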

/*
 * If we are truncating the full folio (i.e. offset == 0), adjust the
 * dirty folio counters appropriately.  Only called if there is private
 * data on the folio.
 */
static void ceph_invalidate_folio(struct folio *folio, size_t offset,
				  size_t length)
{
	struct inode *inode = folio->mapping->host;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc;


	if (offset != 0 || length != folio_size(folio)) {
		doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
		      ceph_vinop(inode), folio->index, offset, length);
		return;
	}

	WARN_ON(!folio_test_locked(folio));
	if (folio_test_private(folio)) {
		doutc(cl, "%llx.%llx idx %lu full dirty page\n",
		      ceph_vinop(inode), folio->index);

		snapc = folio_detach_private(folio);
		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
		ceph_put_snap_context(snapc);
	}

	folio_wait_fscache(folio);
}

static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
{
	struct inode *inode = folio->mapping->host;
	struct ceph_client *cl = ceph_inode_to_client(inode);

	doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode),
	      folio->index, folio_test_dirty(folio) ? "" : "not ");

	if (folio_test_private(folio))
		return false;

	if (folio_test_fscache(folio)) {
		if (current_is_kswapd() || !(gfp & __GFP_FS))
			return false;
		folio_wait_fscache(folio);
	}
	ceph_fscache_note_page_release(inode);
	return true;
}

static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
{
	struct inode *inode = rreq->inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_layout *lo = &ci->i_layout;
	unsigned long max_pages = inode->i_sb->s_bdi->ra_pages;
	loff_t end = rreq->start + rreq->len, new_end;
	struct ceph_netfs_request_data *priv = rreq->netfs_priv;
	unsigned long max_len;
	u32 blockoff;

	if (priv) {
		/* Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM */
		if (priv->file_ra_disabled)
			max_pages = 0;
		else
			max_pages = priv->file_ra_pages;

	}

	/* Readahead is disabled */
	if (!max_pages)
		return;

	max_len = max_pages << PAGE_SHIFT;

	/*
	 * Try to expand the length forward by rounding it up to the next
	 * block, but do not exceed the file size, unless the original
	 * request already exceeds it.
	 */
	new_end = min(round_up(end, lo->stripe_unit), rreq->i_size);
	if (new_end > end && new_end <= rreq->start + max_len)
		rreq->len = new_end - rreq->start;

	/* Try to expand the start downward */
	div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
	if (rreq->len + blockoff <= max_len) {
		rreq->start -= blockoff;
		rreq->len += blockoff;
	}
}
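/*
 * Example (illustrative): with a 4 MiB stripe_unit, a 64 KiB read at
 * offset 4 MiB + 16 KiB first grows forward to the next stripe-unit
 * boundary (capped at i_size), then grows backward by blockoff ==
 * 16 KiB so that the request starts on the stripe-unit boundary.
 * Each step only applies while the total stays within the readahead
 * window (max_len).
 */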

static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
	struct inode *inode = subreq->rreq->inode;
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 objno, objoff;
	u32 xlen;

	/* Truncate the extent at the end of the current block */
	ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
				      &objno, &objoff, &xlen);
	subreq->len = min(xlen, fsc->mount_options->rsize);
	return true;
}
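/*
 * Behaviour sketch (editorial): ceph_calc_file_object_mapping()
 * returns in xlen the number of bytes of [start, start+len) that fall
 * within a single RADOS object, so a subrequest straddling an object
 * boundary is cut at the boundary here and netfs issues the remainder
 * as a further subrequest.  The rsize mount option additionally caps
 * each read.
 */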

static void finish_netfs_read(struct ceph_osd_request *req)
{
	struct inode *inode = req->r_inode;
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
	struct netfs_io_subrequest *subreq = req->r_priv;
	struct ceph_osd_req_op *op = &req->r_ops[0];
	int err = req->r_result;
	bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);

	ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
				 req->r_end_latency, osd_data->length, err);

	doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
	      subreq->len, i_size_read(req->r_inode));

	/* no object means success but no data */
	if (err == -ENOENT)
		err = 0;
	else if (err == -EBLOCKLISTED)
		fsc->blocklisted = true;

	if (err >= 0) {
		if (sparse && err > 0)
			err = ceph_sparse_ext_map_end(op);
		if (err < subreq->len)
			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
		if (IS_ENCRYPTED(inode) && err > 0) {
			err = ceph_fscrypt_decrypt_extents(inode,
					osd_data->pages, subreq->start,
					op->extent.sparse_ext,
					op->extent.sparse_ext_cnt);
			if (err > subreq->len)
				err = subreq->len;
		}
	}

	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
		ceph_put_page_vector(osd_data->pages,
				     calc_pages_for(osd_data->alignment,
					osd_data->length), false);
	}
	netfs_subreq_terminated(subreq, err, false);
	iput(req->r_inode);
	ceph_dec_osd_stopping_blocker(fsc->mdsc);
}

static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct inode *inode = rreq->inode;
	struct ceph_mds_reply_info_parsed *rinfo;
	struct ceph_mds_reply_info_in *iinfo;
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct iov_iter iter;
	ssize_t err = 0;
	size_t len;
	int mode;

	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
	__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);

	if (subreq->start >= inode->i_size)
		goto out;

	/* We need to fetch the inline data. */
	mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_ino1 = ci->i_vino;
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
	req->r_num_caps = 2;

	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err < 0)
		goto out;

	rinfo = &req->r_reply_info;
	iinfo = &rinfo->targeti;
	if (iinfo->inline_version == CEPH_INLINE_NONE) {
		/* The data got uninlined */
		ceph_mdsc_put_request(req);
		return false;
	}

	len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
	iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
	err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
	if (err == 0)
		err = -EFAULT;

	ceph_mdsc_put_request(req);
out:
	netfs_subreq_terminated(subreq, err, false);
	return true;
}

static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct inode *inode = rreq->inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_osd_request *req = NULL;
	struct ceph_vino vino = ceph_vino(inode);
	struct iov_iter iter;
	int err = 0;
	u64 len = subreq->len;
	bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
	u64 off = subreq->start;

	if (ceph_inode_is_shutdown(inode)) {
		err = -EIO;
		goto out;
	}

	if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
		return;

	ceph_fscrypt_adjust_off_and_len(inode, &off, &len);

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
			off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
			CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
			NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		req = NULL;
		goto out;
	}

	if (sparse) {
		err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
		if (err)
			goto out;
	}

	doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
	      ceph_vinop(inode), subreq->start, subreq->len, len);

	iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);

	/*
	 * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
	 * encrypted inodes. We'd need infrastructure that handles an iov_iter
	 * instead of page arrays, and we don't have that as of yet. Once the
	 * dust settles on the write helpers and encrypt/decrypt routines for
	 * netfs, we should be able to rework this.
	 */
	if (IS_ENCRYPTED(inode)) {
		struct page **pages;
		size_t page_off;

		err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
		if (err < 0) {
			doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
			      ceph_vinop(inode), err);
			goto out;
		}

		/* should always give us a page-aligned read */
		WARN_ON_ONCE(page_off);
		len = err;
		err = 0;

		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
						 false);
	} else {
		osd_req_op_extent_osd_iter(req, 0, &iter);
	}
	if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
		err = -EIO;
		goto out;
	}
	req->r_callback = finish_netfs_read;
	req->r_priv = subreq;
	req->r_inode = inode;
	ihold(inode);

	ceph_osdc_start_request(req->r_osdc, req);
out:
	ceph_osdc_put_request(req);
	if (err)
		netfs_subreq_terminated(subreq, err, false);
	doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}

static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
	struct inode *inode = rreq->inode;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int got = 0, want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
	struct ceph_netfs_request_data *priv;
	int ret = 0;

	if (rreq->origin != NETFS_READAHEAD)
		return 0;

	priv = kzalloc(sizeof(*priv), GFP_NOFS);
	if (!priv)
		return -ENOMEM;

	if (file) {
		struct ceph_rw_context *rw_ctx;
		struct ceph_file_info *fi = file->private_data;

		priv->file_ra_pages = file->f_ra.ra_pages;
		priv->file_ra_disabled = file->f_mode & FMODE_RANDOM;

		rw_ctx = ceph_find_rw_context(fi);
		if (rw_ctx) {
			rreq->netfs_priv = priv;
			return 0;
		}
	}

	/*
	 * readahead callers do not necessarily hold Fcb caps
	 * (e.g. fadvise, madvise).
	 */
	ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
	if (ret < 0) {
		doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
		goto out;
	}

	if (!(got & want)) {
		doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
		ret = -EACCES;
		goto out;
	}
	if (ret == 0) {
		ret = -EACCES;
		goto out;
	}

	priv->caps = got;
	rreq->netfs_priv = priv;

out:
	if (ret < 0)
		kfree(priv);

	return ret;
}

static void ceph_netfs_free_request(struct netfs_io_request *rreq)
{
	struct ceph_netfs_request_data *priv = rreq->netfs_priv;

	if (!priv)
		return;

	if (priv->caps)
		ceph_put_cap_refs(ceph_inode(rreq->inode), priv->caps);
	kfree(priv);
	rreq->netfs_priv = NULL;
}

const struct netfs_request_ops ceph_netfs_ops = {
	.init_request		= ceph_init_request,
	.free_request		= ceph_netfs_free_request,
	.begin_cache_operation	= ceph_begin_cache_operation,
	.issue_read		= ceph_netfs_issue_read,
	.expand_readahead	= ceph_netfs_expand_readahead,
	.clamp_length		= ceph_netfs_clamp_length,
	.check_write_begin	= ceph_netfs_check_write_begin,
};

#ifdef CONFIG_CEPH_FSCACHE
static void ceph_set_page_fscache(struct page *page)
{
	set_page_fscache(page);
}

static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
{
	struct inode *inode = priv;

	if (IS_ERR_VALUE(error) && error != -ENOBUFS)
		ceph_fscache_invalidate(inode, false);
}

static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct fscache_cookie *cookie = ceph_fscache_cookie(ci);

	fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
			       ceph_fscache_write_terminated, inode, caching);
}
#else
static inline void ceph_set_page_fscache(struct page *page)
{
}

static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
{
}
#endif /* CONFIG_CEPH_FSCACHE */

struct ceph_writeback_ctl
{
	loff_t i_size;
	u64 truncate_size;
	u32 truncate_seq;
	bool size_stable;
	bool head_snapc;
};
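/*
 * Note (editorial): these fields are filled in by get_oldest_context()
 * below.  i_size/size_stable describe the size the write should be
 * clamped against -- the capsnap's frozen size (stable) once the
 * capsnap's sync write has completed, otherwise the live inode size
 * (unstable).  head_snapc records whether we are writing the "head"
 * (most recent) context.
 */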

/*
 * Get ref for the oldest snapc for an inode with dirty data... that is, the
 * only snap context we are allowed to write back.
 */
static struct ceph_snap_context *
get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
		   struct ceph_snap_context *page_snapc)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_snap_context *snapc = NULL;
	struct ceph_cap_snap *capsnap = NULL;

	spin_lock(&ci->i_ceph_lock);
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
		      capsnap, capsnap->context, capsnap->dirty_pages);
		if (!capsnap->dirty_pages)
			continue;

		/* get i_size, truncate_{seq,size} for page_snapc? */
		if (snapc && capsnap->context != page_snapc)
			continue;

		if (ctl) {
			if (capsnap->writing) {
				ctl->i_size = i_size_read(inode);
				ctl->size_stable = false;
			} else {
				ctl->i_size = capsnap->size;
				ctl->size_stable = true;
			}
			ctl->truncate_size = capsnap->truncate_size;
			ctl->truncate_seq = capsnap->truncate_seq;
			ctl->head_snapc = false;
		}

		if (snapc)
			break;

		snapc = ceph_get_snap_context(capsnap->context);
		if (!page_snapc ||
		    page_snapc == snapc ||
		    page_snapc->seq > snapc->seq)
			break;
	}
	if (!snapc && ci->i_wrbuffer_ref_head) {
		snapc = ceph_get_snap_context(ci->i_head_snapc);
		doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
		      ci->i_wrbuffer_ref_head);
		if (ctl) {
			ctl->i_size = i_size_read(inode);
			ctl->truncate_size = ci->i_truncate_size;
			ctl->truncate_seq = ci->i_truncate_seq;
			ctl->size_stable = false;
			ctl->head_snapc = true;
		}
	}
	spin_unlock(&ci->i_ceph_lock);
	return snapc;
}
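/*
 * Behaviour sketch (editorial): because i_cap_snaps is sorted oldest
 * to newest, the first capsnap with dirty pages is the oldest context
 * with data to flush.  When the caller passes a page_snapc, the loop
 * keeps walking after taking the snapc ref so that ctl ends up
 * reflecting the i_size/truncate state of the page's own context
 * rather than that of the oldest one.
 */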

static u64 get_writepages_data_length(struct inode *inode,
				      struct page *page, u64 start)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc;
	struct ceph_cap_snap *capsnap = NULL;
	u64 end = i_size_read(inode);
	u64 ret;

	snapc = page_snap_context(ceph_fscrypt_pagecache_page(page));
	if (snapc != ci->i_head_snapc) {
		bool found = false;
		spin_lock(&ci->i_ceph_lock);
		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
			if (capsnap->context == snapc) {
				if (!capsnap->writing)
					end = capsnap->size;
				found = true;
				break;
			}
		}
		spin_unlock(&ci->i_ceph_lock);
		WARN_ON(!found);
	}
	if (end > ceph_fscrypt_page_offset(page) + thp_size(page))
		end = ceph_fscrypt_page_offset(page) + thp_size(page);
	ret = end > start ? end - start : 0;
	if (ret && fscrypt_is_bounce_page(page))
		ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
	return ret;
}
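/*
 * Example (editorial): if a page spans file offsets 8192..12287 but
 * the page's capsnap froze the size at 10000, then for start == 8192
 * this returns 10000 - 8192 == 1808 bytes (rounded up to the fscrypt
 * block size when writing through a bounce page), so the OSD write
 * stops at the snap's EOF rather than at the page boundary.
 */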

/*
 * Write a single page, but leave the page locked.
 *
 * If we get a write error, mark the mapping for error, but still adjust the
 * dirty page accounting (i.e., page is no longer dirty).
 */
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
	struct folio *folio = page_folio(page);
	struct inode *inode = page->mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_snap_context *snapc, *oldest;
	loff_t page_off = page_offset(page);
	int err;
	loff_t len = thp_size(page);
	loff_t wlen;
	struct ceph_writeback_ctl ceph_wbc;
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	struct ceph_osd_request *req;
	bool caching = ceph_is_cache_enabled(inode);
	struct page *bounce_page = NULL;

	doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page,
	      page->index);

	if (ceph_inode_is_shutdown(inode))
		return -EIO;

	/* verify this is a writeable snap context */
	snapc = page_snap_context(page);
	if (!snapc) {
		doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode),
		      page);
		return 0;
	}
	oldest = get_oldest_context(inode, &ceph_wbc, snapc);
	if (snapc->seq > oldest->seq) {
		doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n",
		      ceph_vinop(inode), page, snapc);
		/* we should only noop if called by kswapd */
		WARN_ON(!(current->flags & PF_MEMALLOC));
		ceph_put_snap_context(oldest);
		redirty_page_for_writepage(wbc, page);
		return 0;
	}
	ceph_put_snap_context(oldest);

	/* is this a partial page at end of file? */
	if (page_off >= ceph_wbc.i_size) {
		doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
		      ceph_vinop(inode), folio->index, ceph_wbc.i_size);
		folio_invalidate(folio, 0, folio_size(folio));
		return 0;
	}

	if (ceph_wbc.i_size < page_off + len)
		len = ceph_wbc.i_size - page_off;

	wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
	doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n",
	      ceph_vinop(inode), page, page->index, page_off, wlen, snapc,
	      snapc->seq);

	if (atomic_long_inc_return(&fsc->writeback_count) >
	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
		fsc->write_congested = true;

	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
				    page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE,
				    CEPH_OSD_FLAG_WRITE, snapc,
				    ceph_wbc.truncate_seq,
				    ceph_wbc.truncate_size, true);
	if (IS_ERR(req)) {
		redirty_page_for_writepage(wbc, page);
		return PTR_ERR(req);
	}

	if (wlen < len)
		len = wlen;

	set_page_writeback(page);
	if (caching)
		ceph_set_page_fscache(page);
	ceph_fscache_write_to_cache(inode, page_off, len, caching);

	if (IS_ENCRYPTED(inode)) {
		bounce_page = fscrypt_encrypt_pagecache_blocks(page,
						    CEPH_FSCRYPT_BLOCK_SIZE, 0,
						    GFP_NOFS);
		if (IS_ERR(bounce_page)) {
			redirty_page_for_writepage(wbc, page);
			end_page_writeback(page);
			ceph_osdc_put_request(req);
			return PTR_ERR(bounce_page);
		}
	}

	/* it may be a short write due to an object boundary */
	WARN_ON_ONCE(len > thp_size(page));
	osd_req_op_extent_osd_data_pages(req, 0,
			bounce_page ? &bounce_page : &page, wlen, 0,
			false, false);
	doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
	      ceph_vinop(inode), page_off, len, wlen,
	      IS_ENCRYPTED(inode) ? "" : "not ");

	req->r_mtime = inode_get_mtime(inode);
	ceph_osdc_start_request(osdc, req);
	err = ceph_osdc_wait_request(osdc, req);

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, err);
	fscrypt_free_bounce_page(bounce_page);
	ceph_osdc_put_request(req);
	if (err == 0)
		err = len;

	if (err < 0) {
		struct writeback_control tmp_wbc;
		if (!wbc)
			wbc = &tmp_wbc;
		if (err == -ERESTARTSYS) {
			/* killed by SIGKILL */
			doutc(cl, "%llx.%llx interrupted page %p\n",
			      ceph_vinop(inode), page);
			redirty_page_for_writepage(wbc, page);
			end_page_writeback(page);
			return err;
		}
		if (err == -EBLOCKLISTED)
			fsc->blocklisted = true;
		doutc(cl, "%llx.%llx setting page/mapping error %d %p\n",
		      ceph_vinop(inode), err, page);
		mapping_set_error(&inode->i_data, err);
		wbc->pages_skipped++;
	} else {
		doutc(cl, "%llx.%llx cleaned page %p\n",
		      ceph_vinop(inode), page);
		err = 0;  /* vfs expects us to return 0 */
	}
	oldest = detach_page_private(page);
	WARN_ON_ONCE(oldest != snapc);
	end_page_writeback(page);
	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
	ceph_put_snap_context(snapc);  /* page's reference */

	if (atomic_long_dec_return(&fsc->writeback_count) <
	    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
		fsc->write_congested = false;

	return err;
}

static int ceph_writepage(struct page *page, struct writeback_control *wbc)
{
	int err;
	struct inode *inode = page->mapping->host;
	BUG_ON(!inode);
	ihold(inode);

	if (wbc->sync_mode == WB_SYNC_NONE &&
	    ceph_inode_to_fs_client(inode)->write_congested)
		return AOP_WRITEPAGE_ACTIVATE;

	wait_on_page_fscache(page);

	err = writepage_nounlock(page, wbc);
	if (err == -ERESTARTSYS) {
		/* direct memory reclaimer was killed by SIGKILL. return 0
		 * to prevent caller from setting mapping/page error */
		err = 0;
	}
	unlock_page(page);
	iput(inode);
	return err;
}

/*
 * async writeback completion handler.
 *
 * If we get an error, set the mapping error bit, but not the individual
 * page error bits.
 */
static void writepages_finish(struct ceph_osd_request *req)
{
	struct inode *inode = req->r_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_osd_data *osd_data;
	struct page *page;
	int num_pages, total_pages = 0;
	int i, j;
	int rc = req->r_result;
	struct ceph_snap_context *snapc = req->r_snapc;
	struct address_space *mapping = inode->i_mapping;
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	unsigned int len = 0;
	bool remove_page;

	doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
	if (rc < 0) {
		mapping_set_error(mapping, rc);
		ceph_set_error_write(ci);
		if (rc == -EBLOCKLISTED)
			fsc->blocklisted = true;
	} else {
		ceph_clear_error_write(ci);
	}

	/*
	 * We lost the cache cap, need to truncate the page before
	 * it is unlocked, otherwise we'd truncate it later in the
	 * page truncation thread, possibly losing some data that
	 * raced its way in
	 */
	remove_page = !(ceph_caps_issued(ci) &
			(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));

	/* clean all pages */
	for (i = 0; i < req->r_num_ops; i++) {
		if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
			pr_warn_client(cl,
				"%llx.%llx incorrect op %d req %p index %d tid %llu\n",
				ceph_vinop(inode), req->r_ops[i].op, req, i,
				req->r_tid);
			break;
		}

		osd_data = osd_req_op_extent_osd_data(req, i);
		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
		len += osd_data->length;
		num_pages = calc_pages_for((u64)osd_data->alignment,
					   (u64)osd_data->length);
		total_pages += num_pages;
		for (j = 0; j < num_pages; j++) {
			page = osd_data->pages[j];
			if (fscrypt_is_bounce_page(page)) {
				page = fscrypt_pagecache_page(page);
				fscrypt_free_bounce_page(osd_data->pages[j]);
				osd_data->pages[j] = page;
			}
			BUG_ON(!page);
			WARN_ON(!PageUptodate(page));

			if (atomic_long_dec_return(&fsc->writeback_count) <
			     CONGESTION_OFF_THRESH(
					fsc->mount_options->congestion_kb))
				fsc->write_congested = false;

			ceph_put_snap_context(detach_page_private(page));
			end_page_writeback(page);
			doutc(cl, "unlocking %p\n", page);

			if (remove_page)
				generic_error_remove_page(inode->i_mapping,
							  page);

			unlock_page(page);
		}
		doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
		      ceph_vinop(inode), osd_data->length,
		      rc >= 0 ? num_pages : 0);

		release_pages(osd_data->pages, num_pages);
	}

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, rc);

	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);

	osd_data = osd_req_op_extent_osd_data(req, 0);
	if (osd_data->pages_from_pool)
		mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
	else
		kfree(osd_data->pages);
	ceph_osdc_put_request(req);
	ceph_dec_osd_stopping_blocker(fsc->mdsc);
}

/*
 * initiate async writeback
 */
static int ceph_writepages_start(struct address_space *mapping,
				 struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_vino vino = ceph_vino(inode);
	pgoff_t index, start_index, end = -1;
	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
	struct folio_batch fbatch;
	int rc = 0;
	unsigned int wsize = i_blocksize(inode);
	struct ceph_osd_request *req = NULL;
	struct ceph_writeback_ctl ceph_wbc;
	bool should_loop, range_whole = false;
	bool done = false;
	bool caching = ceph_is_cache_enabled(inode);
	xa_mark_t tag;

	if (wbc->sync_mode == WB_SYNC_NONE &&
	    fsc->write_congested)
		return 0;

	doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
	      wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
	      (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));

	if (ceph_inode_is_shutdown(inode)) {
		if (ci->i_wrbuffer_ref > 0) {
			pr_warn_ratelimited_client(cl,
				"%llx.%llx %lld forced umount\n",
				ceph_vinop(inode), ceph_ino(inode));
		}
		mapping_set_error(mapping, -EIO);
		return -EIO; /* we're in a forced umount, don't write! */
	}
	if (fsc->mount_options->wsize < wsize)
		wsize = fsc->mount_options->wsize;

	folio_batch_init(&fbatch);

	start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
	index = start_index;

	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
		tag = PAGECACHE_TAG_TOWRITE;
	} else {
		tag = PAGECACHE_TAG_DIRTY;
	}
retry:
	/* find oldest snap context with dirty data */
	snapc = get_oldest_context(inode, &ceph_wbc, NULL);
	if (!snapc) {
		/* hmm, why does writepages get called when there
		   is no dirty data? */
		doutc(cl, " no snap context with dirty data?\n");
		goto out;
	}
	doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc,
	      snapc->seq, snapc->num_snaps);

	should_loop = false;
	if (ceph_wbc.head_snapc && snapc != last_snapc) {
		/* where to start/end? */
		if (wbc->range_cyclic) {
			index = start_index;
			end = -1;
			if (index > 0)
				should_loop = true;
			doutc(cl, " cyclic, start at %lu\n", index);
		} else {
			index = wbc->range_start >> PAGE_SHIFT;
			end = wbc->range_end >> PAGE_SHIFT;
			if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
				range_whole = true;
			doutc(cl, " not cyclic, %lu to %lu\n", index, end);
		}
	} else if (!ceph_wbc.head_snapc) {
		/* Do not respect wbc->range_{start,end}. Dirty pages
		 * in that range can be associated with newer snapc.
		 * They are not writeable until we write all dirty pages
		 * associated with 'snapc' get written */
		if (index > 0)
			should_loop = true;
		doutc(cl, " non-head snapc, range whole\n");
	}

	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);

	ceph_put_snap_context(last_snapc);
	last_snapc = snapc;

	while (!done && index <= end) {
		int num_ops = 0, op_idx;
		unsigned i, nr_folios, max_pages, locked_pages = 0;
		struct page **pages = NULL, **data_pages;
		struct page *page;
		pgoff_t strip_unit_end = 0;
		u64 offset = 0, len = 0;
		bool from_pool = false;

		max_pages = wsize >> PAGE_SHIFT;

get_more_pages:
		nr_folios = filemap_get_folios_tag(mapping, &index,
						   end, tag, &fbatch);
		doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios);
		if (!nr_folios && !locked_pages)
			break;
		for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
			page = &fbatch.folios[i]->page;
			doutc(cl, "? %p idx %lu\n", page, page->index);
			if (locked_pages == 0)
				lock_page(page);  /* first page */
			else if (!trylock_page(page))
				break;

			/* only dirty pages, or our accounting breaks */
			if (unlikely(!PageDirty(page)) ||
			    unlikely(page->mapping != mapping)) {
				doutc(cl, "!dirty or !mapping %p\n", page);
				unlock_page(page);
				continue;
			}
			/* only if matching snap context */
			pgsnapc = page_snap_context(page);
			if (pgsnapc != snapc) {
				doutc(cl, "page snapc %p %lld != oldest %p %lld\n",
				      pgsnapc, pgsnapc->seq, snapc, snapc->seq);
				if (!should_loop &&
				    !ceph_wbc.head_snapc &&
				    wbc->sync_mode != WB_SYNC_NONE)
					should_loop = true;
				unlock_page(page);
				continue;
			}
			if (page_offset(page) >= ceph_wbc.i_size) {
				struct folio *folio = page_folio(page);

				doutc(cl, "folio at %lu beyond eof %llu\n",
				      folio->index, ceph_wbc.i_size);
				if ((ceph_wbc.size_stable ||
				    folio_pos(folio) >= i_size_read(inode)) &&
				    folio_clear_dirty_for_io(folio))
					folio_invalidate(folio, 0,
							folio_size(folio));
				folio_unlock(folio);
				continue;
			}
			if (strip_unit_end && (page->index > strip_unit_end)) {
				doutc(cl, "end of strip unit %p\n", page);
				unlock_page(page);
				break;
			}
			if (PageWriteback(page) || PageFsCache(page)) {
				if (wbc->sync_mode == WB_SYNC_NONE) {
					doutc(cl, "%p under writeback\n", page);
					unlock_page(page);
					continue;
				}
				doutc(cl, "waiting on writeback %p\n", page);
				wait_on_page_writeback(page);
				wait_on_page_fscache(page);
			}

			if (!clear_page_dirty_for_io(page)) {
				doutc(cl, "%p !clear_page_dirty_for_io\n", page);
				unlock_page(page);
				continue;
			}

			/*
			 * We have something to write.  If this is
			 * the first locked page this time through,
			 * calculate max possible write size and
			 * allocate a page array
			 */
			if (locked_pages == 0) {
				u64 objnum;
				u64 objoff;
				u32 xlen;

				/* prepare async write request */
				offset = (u64)page_offset(page);
				ceph_calc_file_object_mapping(&ci->i_layout,
							      offset, wsize,
							      &objnum, &objoff,
							      &xlen);
				len = xlen;

				num_ops = 1;
				strip_unit_end = page->index +
					((len - 1) >> PAGE_SHIFT);

				BUG_ON(pages);
				max_pages = calc_pages_for(0, (u64)len);
				pages = kmalloc_array(max_pages,
						      sizeof(*pages),
						      GFP_NOFS);
				if (!pages) {
					from_pool = true;
					pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
					BUG_ON(!pages);
				}

				len = 0;
			} else if (page->index !=
				   (offset + len) >> PAGE_SHIFT) {
				if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
							    CEPH_OSD_MAX_OPS)) {
					redirty_page_for_writepage(wbc, page);
					unlock_page(page);
					break;
				}

				num_ops++;
				offset = (u64)page_offset(page);
				len = 0;
			}

			/* note position of first page in fbatch */
			doutc(cl, "%llx.%llx will write page %p idx %lu\n",
			      ceph_vinop(inode), page, page->index);

			if (atomic_long_inc_return(&fsc->writeback_count) >
			    CONGESTION_ON_THRESH(
				    fsc->mount_options->congestion_kb))
				fsc->write_congested = true;

			if (IS_ENCRYPTED(inode)) {
				pages[locked_pages] =
					fscrypt_encrypt_pagecache_blocks(page,
						PAGE_SIZE, 0,
						locked_pages ? GFP_NOWAIT : GFP_NOFS);
				if (IS_ERR(pages[locked_pages])) {
					if (PTR_ERR(pages[locked_pages]) == -EINVAL)
						pr_err_client(cl,
							"inode->i_blkbits=%hhu\n",
							inode->i_blkbits);
					/* better not fail on first page! */
					BUG_ON(locked_pages == 0);
					pages[locked_pages] = NULL;
					redirty_page_for_writepage(wbc, page);
					unlock_page(page);
					break;
				}
				++locked_pages;
			} else {
				pages[locked_pages++] = page;
			}

			fbatch.folios[i] = NULL;
			len += thp_size(page);
		}

		/* did we get anything? */
		if (!locked_pages)
			goto release_folios;
		if (i) {
			unsigned j, n = 0;
			/* shift unused page to beginning of fbatch */
			for (j = 0; j < nr_folios; j++) {
				if (!fbatch.folios[j])
					continue;
				if (n < j)
					fbatch.folios[n] = fbatch.folios[j];
				n++;
			}
			fbatch.nr = n;

			if (nr_folios && i == nr_folios &&
			    locked_pages < max_pages) {
				doutc(cl, "reached end fbatch, trying for more\n");
				folio_batch_release(&fbatch);
				goto get_more_pages;
			}
		}

new_request:
		offset = ceph_fscrypt_page_offset(pages[0]);
		len = wsize;

		req = ceph_osdc_new_request(&fsc->client->osdc,
					&ci->i_layout, vino,
					offset, &len, 0, num_ops,
					CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
					snapc, ceph_wbc.truncate_seq,
					ceph_wbc.truncate_size, false);
		if (IS_ERR(req)) {
			req = ceph_osdc_new_request(&fsc->client->osdc,
						&ci->i_layout, vino,
						offset, &len, 0,
						min(num_ops,
						    CEPH_OSD_SLAB_OPS),
						CEPH_OSD_OP_WRITE,
						CEPH_OSD_FLAG_WRITE,
						snapc, ceph_wbc.truncate_seq,
						ceph_wbc.truncate_size, true);
			BUG_ON(IS_ERR(req));
		}
		BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) +
			     thp_size(pages[locked_pages - 1]) - offset);

		if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
			rc = -EIO;
			goto release_folios;
		}
		req->r_callback = writepages_finish;
		req->r_inode = inode;

		/* Format the osd request message and submit the write */
		len = 0;
		data_pages = pages;
		op_idx = 0;
		for (i = 0; i < locked_pages; i++) {
			struct page *page = ceph_fscrypt_pagecache_page(pages[i]);

			u64 cur_offset = page_offset(page);
			/*
			 * Discontinuity in page range? Ceph can handle that by just passing
			 * multiple extents in the write op.
			 */
			if (offset + len != cur_offset) {
				/* If it's full, stop here */
				if (op_idx + 1 == req->r_num_ops)
					break;

				/* Kick off an fscache write with what we have so far. */
				ceph_fscache_write_to_cache(inode, offset, len, caching);

				/* Start a new extent */
				osd_req_op_extent_dup_last(req, op_idx,
							   cur_offset - offset);
				doutc(cl, "got pages at %llu~%llu\n", offset,
				      len);
				osd_req_op_extent_osd_data_pages(req, op_idx,
							data_pages, len, 0,
							from_pool, false);
				osd_req_op_extent_update(req, op_idx, len);

				len = 0;
				offset = cur_offset;
				data_pages = pages + i;
				op_idx++;
			}

			set_page_writeback(page);
			if (caching)
				ceph_set_page_fscache(page);
			len += thp_size(page);
		}
		ceph_fscache_write_to_cache(inode, offset, len, caching);

		if (ceph_wbc.size_stable) {
			len = min(len, ceph_wbc.i_size - offset);
		} else if (i == locked_pages) {
			/* writepages_finish() clears writeback pages
			 * according to the data length, so make sure
			 * data length covers all locked pages */
			u64 min_len = len + 1 - thp_size(page);
			len = get_writepages_data_length(inode, pages[i - 1],
							 offset);
			len = max(len, min_len);
		}
		if (IS_ENCRYPTED(inode))
			len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);

		doutc(cl, "got pages at %llu~%llu\n", offset, len);

		if (IS_ENCRYPTED(inode) &&
		    ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
			pr_warn_client(cl,
				"bad encrypted write offset=%lld len=%llu\n",
				offset, len);

		osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
						 0, from_pool, false);
		osd_req_op_extent_update(req, op_idx, len);

		BUG_ON(op_idx + 1 != req->r_num_ops);

		from_pool = false;
		if (i < locked_pages) {
			BUG_ON(num_ops <= req->r_num_ops);
			num_ops -= req->r_num_ops;
			locked_pages -= i;

			/* allocate new pages array for next request */
			data_pages = pages;
			pages = kmalloc_array(locked_pages, sizeof(*pages),
					      GFP_NOFS);
			if (!pages) {
				from_pool = true;
				pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
				BUG_ON(!pages);
			}
			memcpy(pages, data_pages + i,
			       locked_pages * sizeof(*pages));
			memset(data_pages + i, 0,
			       locked_pages * sizeof(*pages));
		} else {
			BUG_ON(num_ops != req->r_num_ops);
			index = pages[i - 1]->index + 1;
			/* request message now owns the pages array */
			pages = NULL;
		}

		req->r_mtime = inode_get_mtime(inode);
		ceph_osdc_start_request(&fsc->client->osdc, req);
		req = NULL;

		wbc->nr_to_write -= i;
		if (pages)
			goto new_request;

		/*
		 * We stop writing back only if we are not doing
		 * integrity sync. In case of integrity sync we have to
		 * keep going until we have written all the pages
		 * we tagged for writeback prior to entering this loop.
		 */
		if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
			done = true;

release_folios:
		doutc(cl, "folio_batch release on %d folios (%p)\n",
		      (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL);
		folio_batch_release(&fbatch);
	}

	if (should_loop && !done) {
		/* more to do; loop back to beginning of file */
		doutc(cl, "looping back to beginning of file\n");
		end = start_index - 1; /* OK even when start_index == 0 */

		/* to write dirty pages associated with next snapc,
		 * we need to wait until current writes complete */
		if (wbc->sync_mode != WB_SYNC_NONE &&
		    start_index == 0 && /* all dirty pages were checked */
		    !ceph_wbc.head_snapc) {
			struct page *page;
			unsigned i, nr;
			index = 0;
			while ((index <= end) &&
			       (nr = filemap_get_folios_tag(mapping, &index,
						(pgoff_t)-1,
						PAGECACHE_TAG_WRITEBACK,
						&fbatch))) {
				for (i = 0; i < nr; i++) {
					page = &fbatch.folios[i]->page;
					if (page_snap_context(page) != snapc)
						continue;
					wait_on_page_writeback(page);
				}
				folio_batch_release(&fbatch);
				cond_resched();
			}
		}

		start_index = 0;
		index = 0;
		goto retry;
	}

	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = index;

out:
	ceph_osdc_put_request(req);
	ceph_put_snap_context(last_snapc);
	doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
	      rc);
	return rc;
}


/*
 * See if a given @snapc is either writeable, or already written.
 */
static int context_is_writeable_or_written(struct inode *inode,
					   struct ceph_snap_context *snapc)
{
	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL);
	int ret = !oldest || snapc->seq <= oldest->seq;

	ceph_put_snap_context(oldest);
	return ret;
}
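/*
 * In other words (editorial): if nothing is dirty at all (no oldest
 * context), @snapc has already been written; otherwise it is safe to
 * dirty against only if its seq is at or below the oldest flushable
 * context's seq.  Anything newer must wait its turn in snap order.
 */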

/**
 * ceph_find_incompatible - find an incompatible context and return it
 * @page: page being dirtied
 *
 * We are only allowed to write into/dirty a page if the page is
 * clean, or already dirty within the same snap context. Returns a
 * conflicting context if there is one, NULL if there isn't, or a
 * negative error code on other errors.
 *
 * Must be called with page lock held.
 */
static struct ceph_snap_context *
ceph_find_incompatible(struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);

	if (ceph_inode_is_shutdown(inode)) {
		doutc(cl, " %llx.%llx page %p is shutdown\n",
		      ceph_vinop(inode), page);
		return ERR_PTR(-ESTALE);
	}

	for (;;) {
		struct ceph_snap_context *snapc, *oldest;

		wait_on_page_writeback(page);

		snapc = page_snap_context(page);
		if (!snapc || snapc == ci->i_head_snapc)
			break;

		/*
		 * this page is already dirty in another (older) snap
		 * context!  is it writeable now?
		 */
		oldest = get_oldest_context(inode, NULL, NULL);
		if (snapc->seq > oldest->seq) {
			/* not writeable -- return it for the caller to deal with */
			ceph_put_snap_context(oldest);
			doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n",
			      ceph_vinop(inode), page, snapc);
			return ceph_get_snap_context(snapc);
		}
		ceph_put_snap_context(oldest);

		/* yay, writeable, do it now (without dropping page lock) */
		doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n",
		      ceph_vinop(inode), page, snapc);
		if (clear_page_dirty_for_io(page)) {
			int r = writepage_nounlock(page, NULL);
			if (r < 0)
				return ERR_PTR(r);
		}
	}
	return NULL;
}

static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
					struct folio **foliop, void **_fsdata)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc;

	snapc = ceph_find_incompatible(folio_page(*foliop, 0));
	if (snapc) {
		int r;

		folio_unlock(*foliop);
		folio_put(*foliop);
		*foliop = NULL;
		if (IS_ERR(snapc))
			return PTR_ERR(snapc);

		ceph_queue_writeback(inode);
		r = wait_event_killable(ci->i_cap_wq,
					context_is_writeable_or_written(inode, snapc));
		ceph_put_snap_context(snapc);
		return r == 0 ? -EAGAIN : r;
	}
	return 0;
}

/*
 * We are only allowed to write into/dirty the page if the page is
 * clean, or already dirty within the same snap context.
 */
static int ceph_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct folio *folio = NULL;
	int r;

	r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);
	if (r < 0)
		return r;

	folio_wait_fscache(folio);
	WARN_ON_ONCE(!folio_test_locked(folio));
	*pagep = &folio->page;
	return 0;
}

/*
 * we don't do anything in here that simple_write_end doesn't do
 * except adjust dirty page accounting
 */
static int ceph_write_end(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned copied,
			  struct page *subpage, void *fsdata)
{
	struct folio *folio = page_folio(subpage);
	struct inode *inode = file_inode(file);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	bool check_cap = false;

	doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode),
	      file, folio, (int)pos, (int)copied, (int)len);

	if (!folio_test_uptodate(folio)) {
		/* just return that nothing was copied on a short copy */
		if (copied < len) {
			copied = 0;
			goto out;
		}
		folio_mark_uptodate(folio);
	}

	/* did file size increase? */
	if (pos+copied > i_size_read(inode))
		check_cap = ceph_inode_set_size(inode, pos+copied);

	folio_mark_dirty(folio);

out:
	folio_unlock(folio);
	folio_put(folio);

	if (check_cap)
		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY);

	return copied;
}

const struct address_space_operations ceph_aops = {
	.read_folio = netfs_read_folio,
	.readahead = netfs_readahead,
	.writepage = ceph_writepage,
	.writepages = ceph_writepages_start,
	.write_begin = ceph_write_begin,
	.write_end = ceph_write_end,
	.dirty_folio = ceph_dirty_folio,
	.invalidate_folio = ceph_invalidate_folio,
	.release_folio = ceph_release_folio,
	.direct_IO = noop_direct_IO,
};

static void ceph_block_sigs(sigset_t *oldset)
{
	sigset_t mask;
	siginitsetinv(&mask, sigmask(SIGKILL));
	sigprocmask(SIG_BLOCK, &mask, oldset);
}
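/*
 * Note (editorial): siginitsetinv() builds the complement of SIGKILL's
 * mask, so the fault paths below run with every signal except SIGKILL
 * blocked -- only a fatal signal can interrupt the cap and OSD waits.
 */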

static void ceph_restore_sigs(sigset_t *oldset)
{
	sigprocmask(SIG_SETMASK, oldset, NULL);
}

/*
 * vm ops
 */
static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = file_inode(vma->vm_file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_file_info *fi = vma->vm_file->private_data;
	loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
	int want, got, err;
	sigset_t oldset;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	if (ceph_inode_is_shutdown(inode))
		return ret;

	ceph_block_sigs(&oldset);

	doutc(cl, "%llx.%llx %llu trying to get caps\n",
	      ceph_vinop(inode), off);
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
	else
		want = CEPH_CAP_FILE_CACHE;

	got = 0;
	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got);
	if (err < 0)
		goto out_restore;

	doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode),
	      off, ceph_cap_string(got));

	if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
	    !ceph_has_inline_data(ci)) {
		CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
		ceph_add_rw_context(fi, &rw_ctx);
		ret = filemap_fault(vmf);
		ceph_del_rw_context(fi, &rw_ctx);
		doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n",
		      ceph_vinop(inode), off, ceph_cap_string(got), ret);
	} else
		err = -EAGAIN;

	ceph_put_cap_refs(ci, got);

	if (err != -EAGAIN)
		goto out_restore;

	/* read inline data */
	if (off >= PAGE_SIZE) {
		/* does not support inline data > PAGE_SIZE */
		ret = VM_FAULT_SIGBUS;
	} else {
		struct address_space *mapping = inode->i_mapping;
		struct page *page;

		filemap_invalidate_lock_shared(mapping);
		page = find_or_create_page(mapping, 0,
				mapping_gfp_constraint(mapping, ~__GFP_FS));
		if (!page) {
			ret = VM_FAULT_OOM;
			goto out_inline;
		}
		err = __ceph_do_getattr(inode, page,
					CEPH_STAT_CAP_INLINE_DATA, true);
		if (err < 0 || off >= i_size_read(inode)) {
			unlock_page(page);
			put_page(page);
			ret = vmf_error(err);
			goto out_inline;
		}
		if (err < PAGE_SIZE)
			zero_user_segment(page, err, PAGE_SIZE);
		else
			flush_dcache_page(page);
		SetPageUptodate(page);
		vmf->page = page;
		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
		filemap_invalidate_unlock_shared(mapping);
		doutc(cl, "%llx.%llx %llu read inline data ret %x\n",
		      ceph_vinop(inode), off, ret);
	}
out_restore:
	ceph_restore_sigs(&oldset);
	if (err < 0)
		ret = vmf_error(err);

	return ret;
}

static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = file_inode(vma->vm_file);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_info *fi = vma->vm_file->private_data;
	struct ceph_cap_flush *prealloc_cf;
	struct page *page = vmf->page;
	loff_t off = page_offset(page);
	loff_t size = i_size_read(inode);
	size_t len;
	int want, got, err;
	sigset_t oldset;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	if (ceph_inode_is_shutdown(inode))
		return ret;

	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return VM_FAULT_OOM;

	sb_start_pagefault(inode->i_sb);
	ceph_block_sigs(&oldset);

	if (off + thp_size(page) <= size)
		len = thp_size(page);
	else
		len = offset_in_thp(page, size);

	doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
	      ceph_vinop(inode), off, len, size);
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
	else
		want = CEPH_CAP_FILE_BUFFER;

	got = 0;
	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got);
	if (err < 0)
		goto out_free;

	doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
	      off, len, ceph_cap_string(got));

	/* Update time before taking page lock */
	file_update_time(vma->vm_file);
	inode_inc_iversion_raw(inode);

	do {
		struct ceph_snap_context *snapc;

		lock_page(page);

		if (page_mkwrite_check_truncate(page, inode) < 0) {
			unlock_page(page);
			ret = VM_FAULT_NOPAGE;
			break;
		}

		snapc = ceph_find_incompatible(page);
		if (!snapc) {
			/* success.  we'll keep the page locked. */
			set_page_dirty(page);
			ret = VM_FAULT_LOCKED;
			break;
		}

		unlock_page(page);

		if (IS_ERR(snapc)) {
			ret = VM_FAULT_SIGBUS;
			break;
		}

		ceph_queue_writeback(inode);
		err = wait_event_killable(ci->i_cap_wq,
				context_is_writeable_or_written(inode, snapc));
		ceph_put_snap_context(snapc);
	} while (err == 0);

	if (ret == VM_FAULT_LOCKED) {
		int dirty;
		spin_lock(&ci->i_ceph_lock);
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
					       &prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
		if (dirty)
			__mark_inode_dirty(inode, dirty);
	}

	doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n",
	      ceph_vinop(inode), off, len, ceph_cap_string(got), ret);
	ceph_put_cap_refs_async(ci, got);
out_free:
	ceph_restore_sigs(&oldset);
	sb_end_pagefault(inode->i_sb);
	ceph_free_cap_flush(prealloc_cf);
	if (err < 0)
		ret = vmf_error(err);
	return ret;
}

void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
			   char *data, size_t len)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct address_space *mapping = inode->i_mapping;
	struct page *page;

	if (locked_page) {
		page = locked_page;
	} else {
		if (i_size_read(inode) == 0)
			return;
		page = find_or_create_page(mapping, 0,
					   mapping_gfp_constraint(mapping,
					   ~__GFP_FS));
		if (!page)
			return;
		if (PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return;
		}
	}

	doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode,
	      ceph_vinop(inode), len, locked_page);

	if (len > 0) {
		void *kaddr = kmap_atomic(page);
		memcpy(kaddr, data, len);
		kunmap_atomic(kaddr);
	}

	if (page != locked_page) {
		if (len < PAGE_SIZE)
			zero_user_segment(page, len, PAGE_SIZE);
		else
			flush_dcache_page(page);

		SetPageUptodate(page);
		unlock_page(page);
		put_page(page);
	}
}
1847
int ceph_uninline_data(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_osd_request *req = NULL;
	struct ceph_cap_flush *prealloc_cf = NULL;
	struct folio *folio = NULL;
	u64 inline_version = CEPH_INLINE_NONE;
	struct page *pages[1];
	int err = 0;
	u64 len;

	spin_lock(&ci->i_ceph_lock);
	inline_version = ci->i_inline_version;
	spin_unlock(&ci->i_ceph_lock);

	doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode),
	      inline_version);

	if (ceph_inode_is_shutdown(inode)) {
		err = -EIO;
		goto out;
	}

	if (inline_version == CEPH_INLINE_NONE)
		return 0;

	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return -ENOMEM;

	if (inline_version == 1) /* initial version, no data */
		goto out_uninline;

	folio = read_mapping_folio(inode->i_mapping, 0, file);
	if (IS_ERR(folio)) {
		err = PTR_ERR(folio);
		goto out;
	}

	folio_lock(folio);

	len = i_size_read(inode);
	if (len > folio_size(folio))
		len = folio_size(folio);

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				    ceph_vino(inode), 0, &len, 0, 1,
				    CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
				    NULL, 0, 0, false);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out_unlock;
	}

	req->r_mtime = inode_get_mtime(inode);
	ceph_osdc_start_request(&fsc->client->osdc, req);
	err = ceph_osdc_wait_request(&fsc->client->osdc, req);
	ceph_osdc_put_request(req);
	if (err < 0)
		goto out_unlock;

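	/*
	 * Now write the data itself. The request carries three ops:
	 *   op 0: CMPXATTR - guard; fails with -ECANCELED unless our
	 *	   inline_version is newer than the one recorded on the object
	 *   op 1: WRITE    - the page of former inline data
	 *   op 2: SETXATTR - record the new "inline_version"
	 */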
	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				    ceph_vino(inode), 0, &len, 1, 3,
				    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
				    NULL, ci->i_truncate_seq,
				    ci->i_truncate_size, false);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out_unlock;
	}

	pages[0] = folio_page(folio, 0);
	osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);

	{
		__le64 xattr_buf = cpu_to_le64(inline_version);
		err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
					    "inline_version", &xattr_buf,
					    sizeof(xattr_buf),
					    CEPH_OSD_CMPXATTR_OP_GT,
					    CEPH_OSD_CMPXATTR_MODE_U64);
		if (err)
			goto out_put_req;
	}

	{
		char xattr_buf[32];
		int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
					 "%llu", inline_version);
		err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
					    "inline_version",
					    xattr_buf, xattr_len, 0, 0);
		if (err)
			goto out_put_req;
	}

	req->r_mtime = inode_get_mtime(inode);
	ceph_osdc_start_request(&fsc->client->osdc, req);
	err = ceph_osdc_wait_request(&fsc->client->osdc, req);

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, err);

out_uninline:
	if (!err) {
		int dirty;

		/* Set to CEPH_INLINE_NONE and dirty the caps */
		down_read(&fsc->mdsc->snap_rwsem);
		spin_lock(&ci->i_ceph_lock);
		ci->i_inline_version = CEPH_INLINE_NONE;
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
		up_read(&fsc->mdsc->snap_rwsem);
		if (dirty)
			__mark_inode_dirty(inode, dirty);
	}
out_put_req:
	ceph_osdc_put_request(req);
	if (err == -ECANCELED)
		err = 0;
out_unlock:
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
out:
	ceph_free_cap_flush(prealloc_cf);
	doutc(cl, "%llx.%llx inline_version %llu = %d\n",
	      ceph_vinop(inode), inline_version, err);
	return err;
}

static const struct vm_operations_struct ceph_vmops = {
	.fault		= ceph_filemap_fault,
	.page_mkwrite	= ceph_page_mkwrite,
};

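/*
 * mmap is only supported when the mapping can be populated through the
 * page cache (a_ops->read_folio); faults are then served by
 * ceph_filemap_fault via ceph_vmops.
 */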
int ceph_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->read_folio)
		return -ENOEXEC;
	vma->vm_ops = &ceph_vmops;
	return 0;
}

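/*
 * Pool permission checks: a client may hold caps for a file whose layout
 * points at an OSD pool its OSD caps don't actually allow it to read or
 * write. Probe results are cached per (pool, namespace) in
 * mdsc->pool_perm_tree as a bitmask of the flags below.
 */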
enum {
	POOL_READ	= 1,
	POOL_WRITE	= 2,
};

static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
				s64 pool, struct ceph_string *pool_ns)
{
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_client *cl = fsc->client;
	struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
	struct rb_node **p, *parent;
	struct ceph_pool_perm *perm;
	struct page **pages;
	size_t pool_ns_len;
	int err = 0, err2 = 0, have = 0;

	down_read(&mdsc->pool_perm_rwsem);
	p = &mdsc->pool_perm_tree.rb_node;
	while (*p) {
		perm = rb_entry(*p, struct ceph_pool_perm, node);
		if (pool < perm->pool)
			p = &(*p)->rb_left;
		else if (pool > perm->pool)
			p = &(*p)->rb_right;
		else {
			int ret = ceph_compare_string(pool_ns,
						perm->pool_ns,
						perm->pool_ns_len);
			if (ret < 0)
				p = &(*p)->rb_left;
			else if (ret > 0)
				p = &(*p)->rb_right;
			else {
				have = perm->perm;
				break;
			}
		}
	}
	up_read(&mdsc->pool_perm_rwsem);
	if (*p)
		goto out;

	if (pool_ns)
		doutc(cl, "pool %lld ns %.*s no perm cached\n", pool,
		      (int)pool_ns->len, pool_ns->str);
	else
		doutc(cl, "pool %lld no perm cached\n", pool);

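	/*
	 * Not cached. Retake the tree lock for writing and search again:
	 * another thread may have finished the same probe and inserted an
	 * entry while we only held the read lock.
	 */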
	down_write(&mdsc->pool_perm_rwsem);
	p = &mdsc->pool_perm_tree.rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;
		perm = rb_entry(parent, struct ceph_pool_perm, node);
		if (pool < perm->pool)
			p = &(*p)->rb_left;
		else if (pool > perm->pool)
			p = &(*p)->rb_right;
		else {
			int ret = ceph_compare_string(pool_ns,
						perm->pool_ns,
						perm->pool_ns_len);
			if (ret < 0)
				p = &(*p)->rb_left;
			else if (ret > 0)
				p = &(*p)->rb_right;
			else {
				have = perm->perm;
				break;
			}
		}
	}
	if (*p) {
		up_write(&mdsc->pool_perm_rwsem);
		goto out;
	}

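	/*
	 * Probe the real OSD permissions with two small requests against
	 * the file's first object: a STAT read and an exclusive CREATE
	 * write. The replies (or -EPERM) tell us what our caps allow.
	 */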
	rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
					 1, false, GFP_NOFS);
	if (!rd_req) {
		err = -ENOMEM;
		goto out_unlock;
	}

	rd_req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
	rd_req->r_base_oloc.pool = pool;
	if (pool_ns)
		rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
	ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);

	err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
	if (err)
		goto out_unlock;

	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
					 1, false, GFP_NOFS);
	if (!wr_req) {
		err = -ENOMEM;
		goto out_unlock;
	}

	wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
	ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);

	err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
	if (err)
		goto out_unlock;

	/* one page should be large enough for STAT data */
	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
	if (IS_ERR(pages)) {
		err = PTR_ERR(pages);
		goto out_unlock;
	}

	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
				     0, false, true);
	ceph_osdc_start_request(&fsc->client->osdc, rd_req);

	wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode);
	ceph_osdc_start_request(&fsc->client->osdc, wr_req);

	err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
	err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);

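	/*
	 * Interpret the probe results: -ENOENT still proves read access
	 * (the object simply doesn't exist yet), and -EEXIST proves write
	 * access (the exclusive create lost to an existing object).
	 * -EPERM means the corresponding access is denied; any other error
	 * aborts the probe.
	 */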
	if (err >= 0 || err == -ENOENT)
		have |= POOL_READ;
	else if (err != -EPERM) {
		if (err == -EBLOCKLISTED)
			fsc->blocklisted = true;
		goto out_unlock;
	}

	if (err2 == 0 || err2 == -EEXIST)
		have |= POOL_WRITE;
	else if (err2 != -EPERM) {
		if (err2 == -EBLOCKLISTED)
			fsc->blocklisted = true;
		err = err2;
		goto out_unlock;
	}

	pool_ns_len = pool_ns ? pool_ns->len : 0;
	perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
	if (!perm) {
		err = -ENOMEM;
		goto out_unlock;
	}

	perm->pool = pool;
	perm->perm = have;
	perm->pool_ns_len = pool_ns_len;
	if (pool_ns_len > 0)
		memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
	perm->pool_ns[pool_ns_len] = 0;

	rb_link_node(&perm->node, parent, p);
	rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
	err = 0;
out_unlock:
	up_write(&mdsc->pool_perm_rwsem);

	ceph_osdc_put_request(rd_req);
	ceph_osdc_put_request(wr_req);
out:
	if (!err)
		err = have;
	if (pool_ns)
		doutc(cl, "pool %lld ns %.*s result = %d\n", pool,
		      (int)pool_ns->len, pool_ns->str, err);
	else
		doutc(cl, "pool %lld result = %d\n", pool, err);
	return err;
}

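/*
 * Check whether this client may read and/or write the data pool backing
 * @inode before I/O needing CEPH_CAP_FILE_RD/WR proceeds. Results are
 * cached in i_ceph_flags; if the file's layout changed while we were
 * probing, loop and re-evaluate against the new pool.
 */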
int ceph_pool_perm_check(struct inode *inode, int need)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_string *pool_ns;
	s64 pool;
	int ret, flags;

	/* Only need to do this for regular files */
	if (!S_ISREG(inode->i_mode))
		return 0;

	if (ci->i_vino.snap != CEPH_NOSNAP) {
		/*
		 * Pool permission check needs to write to the first object.
		 * But for snapshots, the head of the first object may already
		 * have been deleted. Skip the check to avoid creating an
		 * orphan object.
		 */
		return 0;
	}

	if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode),
				NOPOOLPERM))
		return 0;

	spin_lock(&ci->i_ceph_lock);
	flags = ci->i_ceph_flags;
	pool = ci->i_layout.pool_id;
	spin_unlock(&ci->i_ceph_lock);
check:
	if (flags & CEPH_I_POOL_PERM) {
		if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
			doutc(cl, "pool %lld no read perm\n", pool);
			return -EPERM;
		}
		if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
			doutc(cl, "pool %lld no write perm\n", pool);
			return -EPERM;
		}
		return 0;
	}

	pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
	ret = __ceph_pool_perm_get(ci, pool, pool_ns);
	ceph_put_string(pool_ns);
	if (ret < 0)
		return ret;

	flags = CEPH_I_POOL_PERM;
	if (ret & POOL_READ)
		flags |= CEPH_I_POOL_RD;
	if (ret & POOL_WRITE)
		flags |= CEPH_I_POOL_WR;

	spin_lock(&ci->i_ceph_lock);
	if (pool == ci->i_layout.pool_id &&
	    pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
		ci->i_ceph_flags |= flags;
	} else {
		pool = ci->i_layout.pool_id;
		flags = ci->i_ceph_flags;
	}
	spin_unlock(&ci->i_ceph_lock);
	goto check;
}

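/*
 * Tear down the pool permission cache, freeing every entry in the tree.
 */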
void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
{
	struct ceph_pool_perm *perm;
	struct rb_node *n;

	while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
		n = rb_first(&mdsc->pool_perm_tree);
		perm = rb_entry(n, struct ceph_pool_perm, node);
		rb_erase(n, &mdsc->pool_perm_tree);
		kfree(perm);
	}
}