fs/fuse/file.c at v5.5-rc2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / fuse / file.c
at v5.5-rc2 3382 lines 84 kB view raw
   1/*
   2  FUSE: Filesystem in Userspace
   3  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
   4
   5  This program can be distributed under the terms of the GNU GPL.
   6  See the file COPYING.
   7*/
   8
   9#include "fuse_i.h"
  10
  11#include <linux/pagemap.h>
  12#include <linux/slab.h>
  13#include <linux/kernel.h>
  14#include <linux/sched.h>
  15#include <linux/sched/signal.h>
  16#include <linux/module.h>
  17#include <linux/compat.h>
  18#include <linux/swap.h>
  19#include <linux/falloc.h>
  20#include <linux/uio.h>
  21
  22static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
  23				      struct fuse_page_desc **desc)
  24{
  25	struct page **pages;
  26
  27	pages = kzalloc(npages * (sizeof(struct page *) +
  28				  sizeof(struct fuse_page_desc)), flags);
  29	*desc = (void *) (pages + npages);
  30
  31	return pages;
  32}
  33
  34static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
  35			  int opcode, struct fuse_open_out *outargp)
  36{
  37	struct fuse_open_in inarg;
  38	FUSE_ARGS(args);
  39
  40	memset(&inarg, 0, sizeof(inarg));
  41	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
  42	if (!fc->atomic_o_trunc)
  43		inarg.flags &= ~O_TRUNC;
  44	args.opcode = opcode;
  45	args.nodeid = nodeid;
  46	args.in_numargs = 1;
  47	args.in_args[0].size = sizeof(inarg);
  48	args.in_args[0].value = &inarg;
  49	args.out_numargs = 1;
  50	args.out_args[0].size = sizeof(*outargp);
  51	args.out_args[0].value = outargp;
  52
  53	return fuse_simple_request(fc, &args);
  54}
  55
/* Pre-allocated state for the release request of an open file */
struct fuse_release_args {
	/* request descriptor; fuse_release_end() maps it back to this struct */
	struct fuse_args args;
	/* request payload, sent as in_args[0] */
	struct fuse_release_in inarg;
	/* inode reference dropped with iput() on completion; may be NULL */
	struct inode *inode;
};
  61
  62struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
  63{
  64	struct fuse_file *ff;
  65
  66	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
  67	if (unlikely(!ff))
  68		return NULL;
  69
  70	ff->fc = fc;
  71	ff->release_args = kzalloc(sizeof(*ff->release_args),
  72				   GFP_KERNEL_ACCOUNT);
  73	if (!ff->release_args) {
  74		kfree(ff);
  75		return NULL;
  76	}
  77
  78	INIT_LIST_HEAD(&ff->write_entry);
  79	mutex_init(&ff->readdir.lock);
  80	refcount_set(&ff->count, 1);
  81	RB_CLEAR_NODE(&ff->polled_node);
  82	init_waitqueue_head(&ff->poll_wait);
  83
  84	ff->kh = atomic64_inc_return(&fc->khctr);
  85
  86	return ff;
  87}
  88
  89void fuse_file_free(struct fuse_file *ff)
  90{
  91	kfree(ff->release_args);
  92	mutex_destroy(&ff->readdir.lock);
  93	kfree(ff);
  94}
  95
/* Take an additional reference on @ff and return it for convenience. */
static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
	refcount_inc(&ff->count);
	return ff;
}
 101
 102static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
 103			     int error)
 104{
 105	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
 106
 107	iput(ra->inode);
 108	kfree(ra);
 109}
 110
 111static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
 112{
 113	if (refcount_dec_and_test(&ff->count)) {
 114		struct fuse_args *args = &ff->release_args->args;
 115
 116		if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
 117			/* Do nothing when client does not implement 'open' */
 118			fuse_release_end(ff->fc, args, 0);
 119		} else if (sync) {
 120			fuse_simple_request(ff->fc, args);
 121			fuse_release_end(ff->fc, args, 0);
 122		} else {
 123			args->end = fuse_release_end;
 124			if (fuse_simple_background(ff->fc, args,
 125						   GFP_KERNEL | __GFP_NOFAIL))
 126				fuse_release_end(ff->fc, args, -ENOTCONN);
 127		}
 128		kfree(ff);
 129	}
 130}
 131
 132int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 133		 bool isdir)
 134{
 135	struct fuse_file *ff;
 136	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
 137
 138	ff = fuse_file_alloc(fc);
 139	if (!ff)
 140		return -ENOMEM;
 141
 142	ff->fh = 0;
 143	/* Default for no-open */
 144	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
 145	if (isdir ? !fc->no_opendir : !fc->no_open) {
 146		struct fuse_open_out outarg;
 147		int err;
 148
 149		err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
 150		if (!err) {
 151			ff->fh = outarg.fh;
 152			ff->open_flags = outarg.open_flags;
 153
 154		} else if (err != -ENOSYS) {
 155			fuse_file_free(ff);
 156			return err;
 157		} else {
 158			if (isdir)
 159				fc->no_opendir = 1;
 160			else
 161				fc->no_open = 1;
 162		}
 163	}
 164
 165	if (isdir)
 166		ff->open_flags &= ~FOPEN_DIRECT_IO;
 167
 168	ff->nodeid = nodeid;
 169	file->private_data = ff;
 170
 171	return 0;
 172}
 173EXPORT_SYMBOL_GPL(fuse_do_open);
 174
 175static void fuse_link_write_file(struct file *file)
 176{
 177	struct inode *inode = file_inode(file);
 178	struct fuse_inode *fi = get_fuse_inode(inode);
 179	struct fuse_file *ff = file->private_data;
 180	/*
 181	 * file may be written through mmap, so chain it onto the
 182	 * inodes's write_file list
 183	 */
 184	spin_lock(&fi->lock);
 185	if (list_empty(&ff->write_entry))
 186		list_add(&ff->write_entry, &fi->write_files);
 187	spin_unlock(&fi->lock);
 188}
 189
 190void fuse_finish_open(struct inode *inode, struct file *file)
 191{
 192	struct fuse_file *ff = file->private_data;
 193	struct fuse_conn *fc = get_fuse_conn(inode);
 194
 195	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
 196		invalidate_inode_pages2(inode->i_mapping);
 197	if (ff->open_flags & FOPEN_STREAM)
 198		stream_open(inode, file);
 199	else if (ff->open_flags & FOPEN_NONSEEKABLE)
 200		nonseekable_open(inode, file);
 201	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
 202		struct fuse_inode *fi = get_fuse_inode(inode);
 203
 204		spin_lock(&fi->lock);
 205		fi->attr_version = atomic64_inc_return(&fc->attr_version);
 206		i_size_write(inode, 0);
 207		spin_unlock(&fi->lock);
 208		fuse_invalidate_attr(inode);
 209		if (fc->writeback_cache)
 210			file_update_time(file);
 211	}
 212	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
 213		fuse_link_write_file(file);
 214}
 215
 216int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 217{
 218	struct fuse_conn *fc = get_fuse_conn(inode);
 219	int err;
 220	bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
 221			  fc->atomic_o_trunc &&
 222			  fc->writeback_cache;
 223
 224	err = generic_file_open(inode, file);
 225	if (err)
 226		return err;
 227
 228	if (is_wb_truncate) {
 229		inode_lock(inode);
 230		fuse_set_nowrite(inode);
 231	}
 232
 233	err = fuse_do_open(fc, get_node_id(inode), file, isdir);
 234
 235	if (!err)
 236		fuse_finish_open(inode, file);
 237
 238	if (is_wb_truncate) {
 239		fuse_release_nowrite(inode);
 240		inode_unlock(inode);
 241	}
 242
 243	return err;
 244}
 245
 246static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
 247				 int flags, int opcode)
 248{
 249	struct fuse_conn *fc = ff->fc;
 250	struct fuse_release_args *ra = ff->release_args;
 251
 252	/* Inode is NULL on error path of fuse_create_open() */
 253	if (likely(fi)) {
 254		spin_lock(&fi->lock);
 255		list_del(&ff->write_entry);
 256		spin_unlock(&fi->lock);
 257	}
 258	spin_lock(&fc->lock);
 259	if (!RB_EMPTY_NODE(&ff->polled_node))
 260		rb_erase(&ff->polled_node, &fc->polled_files);
 261	spin_unlock(&fc->lock);
 262
 263	wake_up_interruptible_all(&ff->poll_wait);
 264
 265	ra->inarg.fh = ff->fh;
 266	ra->inarg.flags = flags;
 267	ra->args.in_numargs = 1;
 268	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
 269	ra->args.in_args[0].value = &ra->inarg;
 270	ra->args.opcode = opcode;
 271	ra->args.nodeid = ff->nodeid;
 272	ra->args.force = true;
 273	ra->args.nocreds = true;
 274}
 275
 276void fuse_release_common(struct file *file, bool isdir)
 277{
 278	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
 279	struct fuse_file *ff = file->private_data;
 280	struct fuse_release_args *ra = ff->release_args;
 281	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
 282
 283	fuse_prepare_release(fi, ff, file->f_flags, opcode);
 284
 285	if (ff->flock) {
 286		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
 287		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
 288							  (fl_owner_t) file);
 289	}
 290	/* Hold inode until release is finished */
 291	ra->inode = igrab(file_inode(file));
 292
 293	/*
 294	 * Normally this will send the RELEASE request, however if
 295	 * some asynchronous READ or WRITE requests are outstanding,
 296	 * the sending will be delayed.
 297	 *
 298	 * Make the release synchronous if this is a fuseblk mount,
 299	 * synchronous RELEASE is allowed (and desirable) in this case
 300	 * because the server can be trusted not to screw up.
 301	 */
 302	fuse_file_put(ff, ff->fc->destroy, isdir);
 303}
 304
/* Open handler for non-directories: fuse_open_common() with isdir false. */
static int fuse_open(struct inode *inode, struct file *file)
{
	return fuse_open_common(inode, file, false);
}
 309
 310static int fuse_release(struct inode *inode, struct file *file)
 311{
 312	struct fuse_conn *fc = get_fuse_conn(inode);
 313
 314	/* see fuse_vma_close() for !writeback_cache case */
 315	if (fc->writeback_cache)
 316		write_inode_now(inode, 1);
 317
 318	fuse_release_common(file, false);
 319
 320	/* return value is ignored by VFS */
 321	return 0;
 322}
 323
/*
 * Synchronously release @ff with a FUSE_RELEASE request.
 *
 * The caller is expected to hold the only reference (a warning is emitted
 * otherwise).  @fi may be NULL, in which case fuse_prepare_release() skips
 * the per-inode unlinking.
 */
void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
{
	WARN_ON(refcount_read(&ff->count) > 1);
	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
	/*
	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
	 * synchronous, we are fine with not doing igrab() here.
	 */
	fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);
 335
 336/*
 337 * Scramble the ID space with XTEA, so that the value of the files_struct
 338 * pointer is not exposed to userspace.
 339 */
 340u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
 341{
 342	u32 *k = fc->scramble_key;
 343	u64 v = (unsigned long) id;
 344	u32 v0 = v;
 345	u32 v1 = v >> 32;
 346	u32 sum = 0;
 347	int i;
 348
 349	for (i = 0; i < 32; i++) {
 350		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
 351		sum += 0x9E3779B9;
 352		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
 353	}
 354
 355	return (u64) v0 + ((u64) v1 << 32);
 356}
 357
/* Per-request state for a page writeback request */
struct fuse_writepage_args {
	/* the write request; its offset and page count give the page range */
	struct fuse_io_args ia;
	/* link in the owning inode's fi->writepages list */
	struct list_head writepages_entry;
	/* NOTE(review): presumably a link in a queue of pending writes - confirm */
	struct list_head queue_entry;
	/* NOTE(review): presumably chains further requests for the same pages - confirm */
	struct fuse_writepage_args *next;
	/* inode the request belongs to */
	struct inode *inode;
};
 365
 366static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
 367					    pgoff_t idx_from, pgoff_t idx_to)
 368{
 369	struct fuse_writepage_args *wpa;
 370
 371	list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
 372		pgoff_t curr_index;
 373
 374		WARN_ON(get_fuse_inode(wpa->inode) != fi);
 375		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
 376		if (idx_from < curr_index + wpa->ia.ap.num_pages &&
 377		    curr_index <= idx_to) {
 378			return wpa;
 379		}
 380	}
 381	return NULL;
 382}
 383
 384/*
 385 * Check if any page in a range is under writeback
 386 *
 387 * This is currently done by walking the list of writepage requests
 388 * for the inode, which can be pretty inefficient.
 389 */
 390static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
 391				   pgoff_t idx_to)
 392{
 393	struct fuse_inode *fi = get_fuse_inode(inode);
 394	bool found;
 395
 396	spin_lock(&fi->lock);
 397	found = fuse_find_writeback(fi, idx_from, idx_to);
 398	spin_unlock(&fi->lock);
 399
 400	return found;
 401}
 402
/* Single-page form of fuse_range_is_writeback() for the page at @index. */
static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
{
	return fuse_range_is_writeback(inode, index, index);
}
 407
/*
 * Wait for page writeback to be completed.
 *
 * Since fuse doesn't rely on the VM writeback tracking, this has to
 * use some other means: sleep on the inode's page_waitq until no
 * writepage request covers the page at @index any more.
 */
static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
}
 420
/*
 * Wait for all pending writepages on the inode to finish.
 *
 * This is currently done by blocking further writes with FUSE_NOWRITE
 * and waiting for all sent writes to complete.
 *
 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 * could conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	/* set and immediately release nowrite; only the wait is wanted */
	fuse_set_nowrite(inode);
	fuse_release_nowrite(inode);
}
 435
 436static int fuse_flush(struct file *file, fl_owner_t id)
 437{
 438	struct inode *inode = file_inode(file);
 439	struct fuse_conn *fc = get_fuse_conn(inode);
 440	struct fuse_file *ff = file->private_data;
 441	struct fuse_flush_in inarg;
 442	FUSE_ARGS(args);
 443	int err;
 444
 445	if (is_bad_inode(inode))
 446		return -EIO;
 447
 448	if (fc->no_flush)
 449		return 0;
 450
 451	err = write_inode_now(inode, 1);
 452	if (err)
 453		return err;
 454
 455	inode_lock(inode);
 456	fuse_sync_writes(inode);
 457	inode_unlock(inode);
 458
 459	err = filemap_check_errors(file->f_mapping);
 460	if (err)
 461		return err;
 462
 463	memset(&inarg, 0, sizeof(inarg));
 464	inarg.fh = ff->fh;
 465	inarg.lock_owner = fuse_lock_owner_id(fc, id);
 466	args.opcode = FUSE_FLUSH;
 467	args.nodeid = get_node_id(inode);
 468	args.in_numargs = 1;
 469	args.in_args[0].size = sizeof(inarg);
 470	args.in_args[0].value = &inarg;
 471	args.force = true;
 472
 473	err = fuse_simple_request(fc, &args);
 474	if (err == -ENOSYS) {
 475		fc->no_flush = 1;
 476		err = 0;
 477	}
 478	return err;
 479}
 480
 481int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 482		      int datasync, int opcode)
 483{
 484	struct inode *inode = file->f_mapping->host;
 485	struct fuse_conn *fc = get_fuse_conn(inode);
 486	struct fuse_file *ff = file->private_data;
 487	FUSE_ARGS(args);
 488	struct fuse_fsync_in inarg;
 489
 490	memset(&inarg, 0, sizeof(inarg));
 491	inarg.fh = ff->fh;
 492	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
 493	args.opcode = opcode;
 494	args.nodeid = get_node_id(inode);
 495	args.in_numargs = 1;
 496	args.in_args[0].size = sizeof(inarg);
 497	args.in_args[0].value = &inarg;
 498	return fuse_simple_request(fc, &args);
 499}
 500
 501static int fuse_fsync(struct file *file, loff_t start, loff_t end,
 502		      int datasync)
 503{
 504	struct inode *inode = file->f_mapping->host;
 505	struct fuse_conn *fc = get_fuse_conn(inode);
 506	int err;
 507
 508	if (is_bad_inode(inode))
 509		return -EIO;
 510
 511	inode_lock(inode);
 512
 513	/*
 514	 * Start writeback against all dirty pages of the inode, then
 515	 * wait for all outstanding writes, before sending the FSYNC
 516	 * request.
 517	 */
 518	err = file_write_and_wait_range(file, start, end);
 519	if (err)
 520		goto out;
 521
 522	fuse_sync_writes(inode);
 523
 524	/*
 525	 * Due to implementation of fuse writeback
 526	 * file_write_and_wait_range() does not catch errors.
 527	 * We have to do this directly after fuse_sync_writes()
 528	 */
 529	err = file_check_and_advance_wb_err(file);
 530	if (err)
 531		goto out;
 532
 533	err = sync_inode_metadata(inode, 1);
 534	if (err)
 535		goto out;
 536
 537	if (fc->no_fsync)
 538		goto out;
 539
 540	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
 541	if (err == -ENOSYS) {
 542		fc->no_fsync = 1;
 543		err = 0;
 544	}
 545out:
 546	inode_unlock(inode);
 547
 548	return err;
 549}
 550
 551void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
 552			 size_t count, int opcode)
 553{
 554	struct fuse_file *ff = file->private_data;
 555	struct fuse_args *args = &ia->ap.args;
 556
 557	ia->read.in.fh = ff->fh;
 558	ia->read.in.offset = pos;
 559	ia->read.in.size = count;
 560	ia->read.in.flags = file->f_flags;
 561	args->opcode = opcode;
 562	args->nodeid = ff->nodeid;
 563	args->in_numargs = 1;
 564	args->in_args[0].size = sizeof(ia->read.in);
 565	args->in_args[0].value = &ia->read.in;
 566	args->out_argvar = true;
 567	args->out_numargs = 1;
 568	args->out_args[0].size = count;
 569}
 570
 571static void fuse_release_user_pages(struct fuse_args_pages *ap,
 572				    bool should_dirty)
 573{
 574	unsigned int i;
 575
 576	for (i = 0; i < ap->num_pages; i++) {
 577		if (should_dirty)
 578			set_page_dirty_lock(ap->pages[i]);
 579		put_page(ap->pages[i]);
 580	}
 581}
 582
 583static void fuse_io_release(struct kref *kref)
 584{
 585	kfree(container_of(kref, struct fuse_io_priv, refcnt));
 586}
 587
 588static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
 589{
 590	if (io->err)
 591		return io->err;
 592
 593	if (io->bytes >= 0 && io->write)
 594		return -EIO;
 595
 596	return io->bytes < 0 ? io->size : io->bytes;
 597}
 598
/**
 * In case of short read, the caller sets 'pos' to the position of
 * actual end of fuse request in IO request. Otherwise, if bytes_requested
 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 *
 * An example:
 * User requested DIO read of 64K. It was split into two 32K fuse requests,
 * both submitted asynchronously. The first of them was ACKed by userspace as
 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 * second request was ACKed as short, e.g. only 1K was read, resulting in
 * pos == 33K.
 *
 * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 * will be equal to the length of the longest contiguous fragment of
 * transferred data starting from the beginning of IO request.
 */
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
	int left;

	spin_lock(&io->lock);
	/* keep the first error; otherwise track the smallest short position */
	if (err)
		io->err = io->err ? : err;
	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
		io->bytes = pos;

	left = --io->reqs;
	/* last request of a blocking IO: wake the waiter */
	if (!left && io->blocking)
		complete(io->done);
	spin_unlock(&io->lock);

	/* last request of a non-blocking IO: complete the iocb from here */
	if (!left && !io->blocking) {
		ssize_t res = fuse_get_res_by_io(io);

		if (res >= 0) {
			struct inode *inode = file_inode(io->iocb->ki_filp);
			struct fuse_conn *fc = get_fuse_conn(inode);
			struct fuse_inode *fi = get_fuse_inode(inode);

			/* successful IO: bump the inode's attribute version */
			spin_lock(&fi->lock);
			fi->attr_version = atomic64_inc_return(&fc->attr_version);
			spin_unlock(&fi->lock);
		}

		io->iocb->ki_complete(io->iocb, res, 0);
	}

	/* drop the reference taken when the request was submitted */
	kref_put(&io->refcnt, fuse_io_release);
}
 648
 649static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
 650					  unsigned int npages)
 651{
 652	struct fuse_io_args *ia;
 653
 654	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
 655	if (ia) {
 656		ia->io = io;
 657		ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
 658						&ia->ap.descs);
 659		if (!ia->ap.pages) {
 660			kfree(ia);
 661			ia = NULL;
 662		}
 663	}
 664	return ia;
 665}
 666
/*
 * Free @ia and its page array.  The descriptor array lives in the same
 * allocation as the page array (see fuse_pages_alloc()), so it needs no
 * separate kfree().
 */
static void fuse_io_free(struct fuse_io_args *ia)
{
	kfree(ia->ap.pages);
	kfree(ia);
}
 672
 673static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
 674				  int err)
 675{
 676	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
 677	struct fuse_io_priv *io = ia->io;
 678	ssize_t pos = -1;
 679
 680	fuse_release_user_pages(&ia->ap, io->should_dirty);
 681
 682	if (err) {
 683		/* Nothing */
 684	} else if (io->write) {
 685		if (ia->write.out.size > ia->write.in.size) {
 686			err = -EIO;
 687		} else if (ia->write.in.size != ia->write.out.size) {
 688			pos = ia->write.in.offset - io->offset +
 689				ia->write.out.size;
 690		}
 691	} else {
 692		u32 outsize = args->out_args[0].size;
 693
 694		if (ia->read.in.size != outsize)
 695			pos = ia->read.in.offset - io->offset + outsize;
 696	}
 697
 698	fuse_aio_complete(io, err, pos);
 699	fuse_io_free(ia);
 700}
 701
 702static ssize_t fuse_async_req_send(struct fuse_conn *fc,
 703				   struct fuse_io_args *ia, size_t num_bytes)
 704{
 705	ssize_t err;
 706	struct fuse_io_priv *io = ia->io;
 707
 708	spin_lock(&io->lock);
 709	kref_get(&io->refcnt);
 710	io->size += num_bytes;
 711	io->reqs++;
 712	spin_unlock(&io->lock);
 713
 714	ia->ap.args.end = fuse_aio_complete_req;
 715	err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
 716	if (err)
 717		fuse_aio_complete_req(fc, &ia->ap.args, err);
 718
 719	return num_bytes;
 720}
 721
 722static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
 723			      fl_owner_t owner)
 724{
 725	struct file *file = ia->io->iocb->ki_filp;
 726	struct fuse_file *ff = file->private_data;
 727	struct fuse_conn *fc = ff->fc;
 728
 729	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 730	if (owner != NULL) {
 731		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
 732		ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
 733	}
 734
 735	if (ia->io->async)
 736		return fuse_async_req_send(fc, ia, count);
 737
 738	return fuse_simple_request(fc, &ia->ap.args);
 739}
 740
 741static void fuse_read_update_size(struct inode *inode, loff_t size,
 742				  u64 attr_ver)
 743{
 744	struct fuse_conn *fc = get_fuse_conn(inode);
 745	struct fuse_inode *fi = get_fuse_inode(inode);
 746
 747	spin_lock(&fi->lock);
 748	if (attr_ver == fi->attr_version && size < inode->i_size &&
 749	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
 750		fi->attr_version = atomic64_inc_return(&fc->attr_version);
 751		i_size_write(inode, size);
 752	}
 753	spin_unlock(&fi->lock);
 754}
 755
 756static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
 757			    struct fuse_args_pages *ap)
 758{
 759	struct fuse_conn *fc = get_fuse_conn(inode);
 760
 761	if (fc->writeback_cache) {
 762		/*
 763		 * A hole in a file. Some data after the hole are in page cache,
 764		 * but have not reached the client fs yet. So, the hole is not
 765		 * present there.
 766		 */
 767		int i;
 768		int start_idx = num_read >> PAGE_SHIFT;
 769		size_t off = num_read & (PAGE_SIZE - 1);
 770
 771		for (i = start_idx; i < ap->num_pages; i++) {
 772			zero_user_segment(ap->pages[i], off, PAGE_SIZE);
 773			off = 0;
 774		}
 775	} else {
 776		loff_t pos = page_offset(ap->pages[0]) + num_read;
 777		fuse_read_update_size(inode, pos, attr_ver);
 778	}
 779}
 780
 781static int fuse_do_readpage(struct file *file, struct page *page)
 782{
 783	struct inode *inode = page->mapping->host;
 784	struct fuse_conn *fc = get_fuse_conn(inode);
 785	loff_t pos = page_offset(page);
 786	struct fuse_page_desc desc = { .length = PAGE_SIZE };
 787	struct fuse_io_args ia = {
 788		.ap.args.page_zeroing = true,
 789		.ap.args.out_pages = true,
 790		.ap.num_pages = 1,
 791		.ap.pages = &page,
 792		.ap.descs = &desc,
 793	};
 794	ssize_t res;
 795	u64 attr_ver;
 796
 797	/*
 798	 * Page writeback can extend beyond the lifetime of the
 799	 * page-cache page, so make sure we read a properly synced
 800	 * page.
 801	 */
 802	fuse_wait_on_page_writeback(inode, page->index);
 803
 804	attr_ver = fuse_get_attr_version(fc);
 805
 806	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
 807	res = fuse_simple_request(fc, &ia.ap.args);
 808	if (res < 0)
 809		return res;
 810	/*
 811	 * Short read means EOF.  If file size is larger, truncate it
 812	 */
 813	if (res < desc.length)
 814		fuse_short_read(inode, attr_ver, res, &ia.ap);
 815
 816	SetPageUptodate(page);
 817
 818	return 0;
 819}
 820
 821static int fuse_readpage(struct file *file, struct page *page)
 822{
 823	struct inode *inode = page->mapping->host;
 824	int err;
 825
 826	err = -EIO;
 827	if (is_bad_inode(inode))
 828		goto out;
 829
 830	err = fuse_do_readpage(file, page);
 831	fuse_invalidate_atime(inode);
 832 out:
 833	unlock_page(page);
 834	return err;
 835}
 836
 837static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
 838			       int err)
 839{
 840	int i;
 841	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
 842	struct fuse_args_pages *ap = &ia->ap;
 843	size_t count = ia->read.in.size;
 844	size_t num_read = args->out_args[0].size;
 845	struct address_space *mapping = NULL;
 846
 847	for (i = 0; mapping == NULL && i < ap->num_pages; i++)
 848		mapping = ap->pages[i]->mapping;
 849
 850	if (mapping) {
 851		struct inode *inode = mapping->host;
 852
 853		/*
 854		 * Short read means EOF. If file size is larger, truncate it
 855		 */
 856		if (!err && num_read < count)
 857			fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
 858
 859		fuse_invalidate_atime(inode);
 860	}
 861
 862	for (i = 0; i < ap->num_pages; i++) {
 863		struct page *page = ap->pages[i];
 864
 865		if (!err)
 866			SetPageUptodate(page);
 867		else
 868			SetPageError(page);
 869		unlock_page(page);
 870		put_page(page);
 871	}
 872	if (ia->ff)
 873		fuse_file_put(ia->ff, false, false);
 874
 875	fuse_io_free(ia);
 876}
 877
 878static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 879{
 880	struct fuse_file *ff = file->private_data;
 881	struct fuse_conn *fc = ff->fc;
 882	struct fuse_args_pages *ap = &ia->ap;
 883	loff_t pos = page_offset(ap->pages[0]);
 884	size_t count = ap->num_pages << PAGE_SHIFT;
 885	int err;
 886
 887	ap->args.out_pages = true;
 888	ap->args.page_zeroing = true;
 889	ap->args.page_replace = true;
 890	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 891	ia->read.attr_ver = fuse_get_attr_version(fc);
 892	if (fc->async_read) {
 893		ia->ff = fuse_file_get(ff);
 894		ap->args.end = fuse_readpages_end;
 895		err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
 896		if (!err)
 897			return;
 898	} else {
 899		err = fuse_simple_request(fc, &ap->args);
 900	}
 901	fuse_readpages_end(fc, &ap->args, err);
 902}
 903
/* State shared between fuse_readpages() and its per-page fill callback */
struct fuse_fill_data {
	/* read request currently being filled with pages */
	struct fuse_io_args *ia;
	/* file the read is issued for */
	struct file *file;
	/* inode whose pages are being read */
	struct inode *inode;
	/* pages not yet added to a request; decremented per page */
	unsigned int nr_pages;
	/* page capacity the current request was allocated with */
	unsigned int max_pages;
};
 911
/*
 * Per-page callback for fuse_readpages(): add @page to the read request
 * being built in @_data (a struct fuse_fill_data).
 *
 * If the current request already holds pages and cannot take this one
 * (page limit reached, max_read would be exceeded, or the page does not
 * directly follow the previous one), it is sent and a new request is
 * started.  Returns 0 on success, -ENOMEM if a new request cannot be
 * allocated, or -EIO if the request unexpectedly has no room; the page is
 * unlocked on the error paths.
 */
static int fuse_readpages_fill(void *_data, struct page *page)
{
	struct fuse_fill_data *data = _data;
	struct fuse_io_args *ia = data->ia;
	struct fuse_args_pages *ap = &ia->ap;
	struct inode *inode = data->inode;
	struct fuse_conn *fc = get_fuse_conn(inode);

	fuse_wait_on_page_writeback(inode, page->index);

	if (ap->num_pages &&
	    (ap->num_pages == fc->max_pages ||
	     (ap->num_pages + 1) * PAGE_SIZE > fc->max_read ||
	     ap->pages[ap->num_pages - 1]->index + 1 != page->index)) {
		/* size the next request for the pages that are still to come */
		data->max_pages = min_t(unsigned int, data->nr_pages,
					fc->max_pages);
		/* the old request is consumed by the send */
		fuse_send_readpages(ia, data->file);
		data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
		if (!ia) {
			unlock_page(page);
			return -ENOMEM;
		}
		ap = &ia->ap;
	}

	if (WARN_ON(ap->num_pages >= data->max_pages)) {
		unlock_page(page);
		fuse_io_free(ia);
		return -EIO;
	}

	/* the request holds its own reference on the page */
	get_page(page);
	ap->pages[ap->num_pages] = page;
	ap->descs[ap->num_pages].length = PAGE_SIZE;
	ap->num_pages++;
	data->nr_pages--;
	return 0;
}
 950
 951static int fuse_readpages(struct file *file, struct address_space *mapping,
 952			  struct list_head *pages, unsigned nr_pages)
 953{
 954	struct inode *inode = mapping->host;
 955	struct fuse_conn *fc = get_fuse_conn(inode);
 956	struct fuse_fill_data data;
 957	int err;
 958
 959	err = -EIO;
 960	if (is_bad_inode(inode))
 961		goto out;
 962
 963	data.file = file;
 964	data.inode = inode;
 965	data.nr_pages = nr_pages;
 966	data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);
 967;
 968	data.ia = fuse_io_alloc(NULL, data.max_pages);
 969	err = -ENOMEM;
 970	if (!data.ia)
 971		goto out;
 972
 973	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
 974	if (!err) {
 975		if (data.ia->ap.num_pages)
 976			fuse_send_readpages(data.ia, file);
 977		else
 978			fuse_io_free(data.ia);
 979	}
 980out:
 981	return err;
 982}
 983
 984static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
 985{
 986	struct inode *inode = iocb->ki_filp->f_mapping->host;
 987	struct fuse_conn *fc = get_fuse_conn(inode);
 988
 989	/*
 990	 * In auto invalidate mode, always update attributes on read.
 991	 * Otherwise, only update if we attempt to read past EOF (to ensure
 992	 * i_size is up to date).
 993	 */
 994	if (fc->auto_inval_data ||
 995	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
 996		int err;
 997		err = fuse_update_attributes(inode, iocb->ki_filp);
 998		if (err)
 999			return err;
1000	}
1001
1002	return generic_file_read_iter(iocb, to);
1003}
1004
1005static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
1006				 loff_t pos, size_t count)
1007{
1008	struct fuse_args *args = &ia->ap.args;
1009
1010	ia->write.in.fh = ff->fh;
1011	ia->write.in.offset = pos;
1012	ia->write.in.size = count;
1013	args->opcode = FUSE_WRITE;
1014	args->nodeid = ff->nodeid;
1015	args->in_numargs = 2;
1016	if (ff->fc->minor < 9)
1017		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
1018	else
1019		args->in_args[0].size = sizeof(ia->write.in);
1020	args->in_args[0].value = &ia->write.in;
1021	args->in_args[1].size = count;
1022	args->out_numargs = 1;
1023	args->out_args[0].size = sizeof(ia->write.out);
1024	args->out_args[0].value = &ia->write.out;
1025}
1026
1027static unsigned int fuse_write_flags(struct kiocb *iocb)
1028{
1029	unsigned int flags = iocb->ki_filp->f_flags;
1030
1031	if (iocb->ki_flags & IOCB_DSYNC)
1032		flags |= O_DSYNC;
1033	if (iocb->ki_flags & IOCB_SYNC)
1034		flags |= O_SYNC;
1035
1036	return flags;
1037}
1038
1039static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
1040			       size_t count, fl_owner_t owner)
1041{
1042	struct kiocb *iocb = ia->io->iocb;
1043	struct file *file = iocb->ki_filp;
1044	struct fuse_file *ff = file->private_data;
1045	struct fuse_conn *fc = ff->fc;
1046	struct fuse_write_in *inarg = &ia->write.in;
1047	ssize_t err;
1048
1049	fuse_write_args_fill(ia, ff, pos, count);
1050	inarg->flags = fuse_write_flags(iocb);
1051	if (owner != NULL) {
1052		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
1053		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
1054	}
1055
1056	if (ia->io->async)
1057		return fuse_async_req_send(fc, ia, count);
1058
1059	err = fuse_simple_request(fc, &ia->ap.args);
1060	if (!err && ia->write.out.size > count)
1061		err = -EIO;
1062
1063	return err ?: ia->write.out.size;
1064}
1065
1066bool fuse_write_update_size(struct inode *inode, loff_t pos)
1067{
1068	struct fuse_conn *fc = get_fuse_conn(inode);
1069	struct fuse_inode *fi = get_fuse_inode(inode);
1070	bool ret = false;
1071
1072	spin_lock(&fi->lock);
1073	fi->attr_version = atomic64_inc_return(&fc->attr_version);
1074	if (pos > inode->i_size) {
1075		i_size_write(inode, pos);
1076		ret = true;
1077	}
1078	spin_unlock(&fi->lock);
1079
1080	return ret;
1081}
1082
/*
 * Synchronously send the page-cache pages gathered in ia->ap to the server
 * as a single write of @count bytes at @pos.
 *
 * Every page in ap->pages is unlocked and released before returning,
 * whatever the outcome.  Returns 0 on success or a negative error; the
 * number of bytes the server reported written is left in
 * ia->write.out.size for the caller.
 */
static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
				     struct kiocb *iocb, struct inode *inode,
				     loff_t pos, size_t count)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	unsigned int offset, i;
	int err;

	/* Wait on writeback for each page index before sending new data */
	for (i = 0; i < ap->num_pages; i++)
		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);

	fuse_write_args_fill(ia, ff, pos, count);
	ia->write.in.flags = fuse_write_flags(iocb);

	err = fuse_simple_request(fc, &ap->args);
	/* A reply claiming more bytes than were sent is an I/O error */
	if (!err && ia->write.out.size > count)
		err = -EIO;

	/*
	 * From here on @count tracks the bytes the server reported written
	 * that have not yet been attributed to a page.
	 */
	offset = ap->descs[0].offset;
	count = ia->write.out.size;
	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		/* Only a page written in full, from offset 0, becomes uptodate */
		if (!err && !offset && count >= PAGE_SIZE)
			SetPageUptodate(page);

		/* Consume this page's share of the written bytes */
		if (count > PAGE_SIZE - offset)
			count -= PAGE_SIZE - offset;
		else
			count = 0;
		/* Only the first page may start at a non-zero offset */
		offset = 0;

		unlock_page(page);
		put_page(page);
	}

	return err;
}
1124
/*
 * Copy data from @ii into page-cache pages of @mapping starting at file
 * position @pos, recording each page and the number of bytes copied into it
 * in @ap.
 *
 * The loop stops when the iterator is drained, fc->max_write bytes have been
 * gathered, @max_pages pages are in use, a page was only partially filled
 * (offset != 0 after the copy), or after the first page when fc->big_writes
 * is not set.
 *
 * The pages stored in ap->pages are not unlocked or released here;
 * fuse_send_write_pages() does that after sending them.
 *
 * Returns the number of bytes copied if that is > 0, otherwise -EFAULT or
 * -ENOMEM.
 */
static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
				     struct address_space *mapping,
				     struct iov_iter *ii, loff_t pos,
				     unsigned int max_pages)
{
	struct fuse_conn *fc = get_fuse_conn(mapping->host);
	unsigned offset = pos & (PAGE_SIZE - 1);
	size_t count = 0;
	int err;

	ap->args.in_pages = true;
	/* Only the first page can start at a non-zero in-page offset */
	ap->descs[0].offset = offset;

	do {
		size_t tmp;
		struct page *page;
		pgoff_t index = pos >> PAGE_SHIFT;
		/* Limit the copy to the rest of this page and of the iterator */
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(ii));

		/* ... and to what is left of the per-request write limit */
		bytes = min_t(size_t, bytes, fc->max_write - count);

 again:
		err = -EFAULT;
		if (iov_iter_fault_in_readable(ii, bytes))
			break;

		err = -ENOMEM;
		page = grab_cache_page_write_begin(mapping, index, 0);
		if (!page)
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
		flush_dcache_page(page);

		iov_iter_advance(ii, tmp);
		if (!tmp) {
			/*
			 * Nothing was copied: drop the page and retry with
			 * the length clamped to the current iovec segment.
			 */
			unlock_page(page);
			put_page(page);
			bytes = min(bytes, iov_iter_single_seg_count(ii));
			goto again;
		}

		err = 0;
		ap->pages[ap->num_pages] = page;
		ap->descs[ap->num_pages].length = tmp;
		ap->num_pages++;

		count += tmp;
		pos += tmp;
		offset += tmp;
		/* A completely filled page lets the next one start at 0 */
		if (offset == PAGE_SIZE)
			offset = 0;

		if (!fc->big_writes)
			break;
	} while (iov_iter_count(ii) && count < fc->max_write &&
		 ap->num_pages < max_pages && offset == 0);

	return count > 0 ? count : err;
}
1189
1190static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
1191				     unsigned int max_pages)
1192{
1193	return min_t(unsigned int,
1194		     ((pos + len - 1) >> PAGE_SHIFT) -
1195		     (pos >> PAGE_SHIFT) + 1,
1196		     max_pages);
1197}
1198
/*
 * Buffered write path used when the writeback cache is not in effect: copy
 * user data from @ii into page-cache pages in batches and send each batch to
 * the server synchronously, until the iterator is drained or an error or
 * short write occurs.
 *
 * Returns the number of bytes written if any were, otherwise the error.
 */
static ssize_t fuse_perform_write(struct kiocb *iocb,
				  struct address_space *mapping,
				  struct iov_iter *ii, loff_t pos)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	int err = 0;
	ssize_t res = 0;

	/* Flag the size as unstable for the duration of an extending write */
	if (inode->i_size < pos + iov_iter_count(ii))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	do {
		ssize_t count;
		struct fuse_io_args ia = {};
		struct fuse_args_pages *ap = &ia.ap;
		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
						      fc->max_pages);

		/* Page and descriptor arrays are allocated per batch */
		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
		if (!ap->pages) {
			err = -ENOMEM;
			break;
		}

		count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
		if (count <= 0) {
			err = count;
		} else {
			err = fuse_send_write_pages(&ia, iocb, inode,
						    pos, count);
			if (!err) {
				size_t num_written = ia.write.out.size;

				res += num_written;
				pos += num_written;

				/* break out of the loop on short write */
				if (num_written != count)
					err = -EIO;
			}
		}
		kfree(ap->pages);
	} while (!err && iov_iter_count(ii));

	/* pos has only advanced by bytes the server acknowledged */
	if (res > 0)
		fuse_write_update_size(inode, pos);

	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
	fuse_invalidate_attr(inode);

	return res > 0 ? res : err;
}
1253
/*
 * ->write_iter implementation for files that are not opened with
 * FOPEN_DIRECT_IO (see fuse_file_write_iter()).
 *
 * With the writeback cache enabled the write is handed to
 * generic_file_write_iter() after refreshing attributes.  Otherwise the
 * write is performed under the inode lock, either as a direct write with a
 * buffered fallback for whatever the direct write left over (IOCB_DIRECT),
 * or as a buffered write through fuse_perform_write().
 *
 * Returns the number of bytes written, or a negative error if nothing was
 * written.
 */
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	ssize_t written = 0;
	ssize_t written_buffered = 0;
	struct inode *inode = mapping->host;
	ssize_t err;
	loff_t endbyte = 0;

	if (get_fuse_conn(inode)->writeback_cache) {
		/* Update size (EOF optimization) and mode (SUID clearing) */
		err = fuse_update_attributes(mapping->host, file);
		if (err)
			return err;

		return generic_file_write_iter(iocb, from);
	}

	inode_lock(inode);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	/* A zero return (nothing to write) also leaves through "out" */
	err = generic_write_checks(iocb, from);
	if (err <= 0)
		goto out;

	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos = iocb->ki_pos;
		written = generic_file_direct_write(iocb, from);
		/* Done on error or when the direct write consumed everything */
		if (written < 0 || !iov_iter_count(from))
			goto out;

		pos += written;

		/* Write the remainder through the page cache */
		written_buffered = fuse_perform_write(iocb, mapping, from, pos);
		if (written_buffered < 0) {
			err = written_buffered;
			goto out;
		}
		endbyte = pos + written_buffered - 1;

		/*
		 * Flush the buffered range and then drop those pages from
		 * the cache.
		 */
		err = filemap_write_and_wait_range(file->f_mapping, pos,
						   endbyte);
		if (err)
			goto out;

		invalidate_mapping_pages(file->f_mapping,
					 pos >> PAGE_SHIFT,
					 endbyte >> PAGE_SHIFT);

		written += written_buffered;
		iocb->ki_pos = pos + written_buffered;
	} else {
		written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
		if (written >= 0)
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	inode_unlock(inode);
	if (written > 0)
		written = generic_write_sync(iocb, written);

	/* A non-zero byte count (or sync error) takes precedence over err */
	return written ? written : err;
}
1329
1330static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
1331					       unsigned int index,
1332					       unsigned int nr_pages)
1333{
1334	int i;
1335
1336	for (i = index; i < index + nr_pages; i++)
1337		descs[i].length = PAGE_SIZE - descs[i].offset;
1338}
1339
1340static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
1341{
1342	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
1343}
1344
1345static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
1346					size_t max_size)
1347{
1348	return min(iov_iter_single_seg_count(ii), max_size);
1349}
1350
/*
 * Attach the buffer described by @ii to the request in @ap.
 *
 * @nbytesp: in: maximum number of bytes to attach; out: bytes actually
 *           attached (the iterator is advanced by the same amount).
 * @write:   non-zero if the buffer is the source of a write (request input),
 *           zero if it is the destination of a read (request output).
 * @max_pages: capacity of ap->pages / ap->descs.
 *
 * For kvec iterators the buffer address is stored directly as the request
 * argument value and no pages are collected.  Otherwise pages are obtained
 * with iov_iter_get_pages() and described in ap->descs.
 *
 * Returns 0, or the negative error from iov_iter_get_pages().
 */
static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
			       size_t *nbytesp, int write,
			       unsigned int max_pages)
{
	size_t nbytes = 0;  /* # bytes already packed in req */
	ssize_t ret = 0;

	/* Special case for kernel I/O: can copy directly into the buffer */
	if (iov_iter_is_kvec(ii)) {
		unsigned long user_addr = fuse_get_user_addr(ii);
		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);

		if (write)
			ap->args.in_args[1].value = (void *) user_addr;
		else
			ap->args.out_args[0].value = (void *) user_addr;

		iov_iter_advance(ii, frag_size);
		*nbytesp = frag_size;
		return 0;
	}

	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
		unsigned npages;
		size_t start;
		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
					*nbytesp - nbytes,
					max_pages - ap->num_pages,
					&start);
		if (ret < 0)
			break;

		iov_iter_advance(ii, ret);
		nbytes += ret;

		/*
		 * ret now measures from the beginning of the first page, so
		 * rounding it up to whole pages gives the page count.
		 */
		ret += start;
		npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;

		/* Only the first page of the run starts at a non-zero offset */
		ap->descs[ap->num_pages].offset = start;
		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);

		ap->num_pages += npages;
		/* Trim the unused tail of the last page from its length */
		ap->descs[ap->num_pages - 1].length -=
			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
	}

	if (write)
		ap->args.in_pages = 1;
	else
		ap->args.out_pages = 1;

	*nbytesp = nbytes;

	return ret < 0 ? ret : 0;
}
1406
/*
 * Perform direct I/O between @iter and the server, bypassing the page cache.
 *
 * @io:    I/O context; io->async selects asynchronous submission.
 * @iter:  source (write) or destination (read) buffer.
 * @ppos:  file position; updated on return if any bytes were transferred.
 * @flags: FUSE_DIO_WRITE for a write, FUSE_DIO_CUSE to skip the writeback
 *         synchronisation step.
 *
 * The transfer is split into requests of at most fc->max_write (write) or
 * fc->max_read (read) bytes.  Returns the number of bytes transferred if
 * that is > 0, otherwise the error.
 */
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
		       loff_t *ppos, int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;
	struct file *file = io->iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	size_t nmax = write ? fc->max_write : fc->max_read;
	loff_t pos = *ppos;
	size_t count = iov_iter_count(iter);
	pgoff_t idx_from = pos >> PAGE_SHIFT;
	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
	ssize_t res = 0;
	int err = 0;
	struct fuse_io_args *ia;
	unsigned int max_pages;

	max_pages = iov_iter_npages(iter, fc->max_pages);
	ia = fuse_io_alloc(io, max_pages);
	if (!ia)
		return -ENOMEM;

	ia->io = io;
	/*
	 * If any page in the target range is under writeback, sync writes
	 * first.  The inode lock is taken here only for reads.
	 */
	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
		if (!write)
			inode_lock(inode);
		fuse_sync_writes(inode);
		if (!write)
			inode_unlock(inode);
	}

	/* Set for reads into user (iovec) memory only */
	io->should_dirty = !write && iter_is_iovec(iter);
	while (count) {
		ssize_t nres;
		fl_owner_t owner = current->files;
		size_t nbytes = min(count, nmax);

		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
					  max_pages);
		/* Carry on with a partial batch if some bytes were attached */
		if (err && !nbytes)
			break;

		if (write) {
			if (!capable(CAP_FSETID))
				ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV;

			nres = fuse_send_write(ia, pos, nbytes, owner);
		} else {
			nres = fuse_send_read(ia, pos, nbytes, owner);
		}

		/*
		 * The request is released here for synchronous I/O and on
		 * failure; ia is forgotten in every case.
		 */
		if (!io->async || nres < 0) {
			fuse_release_user_pages(&ia->ap, io->should_dirty);
			fuse_io_free(ia);
		}
		ia = NULL;
		if (nres < 0) {
			err = nres;
			break;
		}
		WARN_ON(nres > nbytes);

		count -= nres;
		res += nres;
		pos += nres;
		/* Short transfer: stop here */
		if (nres != nbytes)
			break;
		if (count) {
			/* Fresh request for the next chunk */
			max_pages = iov_iter_npages(iter, fc->max_pages);
			ia = fuse_io_alloc(io, max_pages);
			if (!ia)
				break;
		}
	}
	/* Only set if the loop exited before the request was sent */
	if (ia)
		fuse_io_free(ia);
	if (res > 0)
		*ppos = pos;

	return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
1491
1492static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1493				  struct iov_iter *iter,
1494				  loff_t *ppos)
1495{
1496	ssize_t res;
1497	struct inode *inode = file_inode(io->iocb->ki_filp);
1498
1499	res = fuse_direct_io(io, iter, ppos, 0);
1500
1501	fuse_invalidate_atime(inode);
1502
1503	return res;
1504}
1505
/* Forward declaration: used by the direct read/write iter paths below */
static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
1507
1508static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
1509{
1510	ssize_t res;
1511
1512	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
1513		res = fuse_direct_IO(iocb, to);
1514	} else {
1515		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1516
1517		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
1518	}
1519
1520	return res;
1521}
1522
1523static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
1524{
1525	struct inode *inode = file_inode(iocb->ki_filp);
1526	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1527	ssize_t res;
1528
1529	/* Don't allow parallel writes to the same file */
1530	inode_lock(inode);
1531	res = generic_write_checks(iocb, from);
1532	if (res > 0) {
1533		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
1534			res = fuse_direct_IO(iocb, from);
1535		} else {
1536			res = fuse_direct_io(&io, from, &iocb->ki_pos,
1537					     FUSE_DIO_WRITE);
1538		}
1539	}
1540	fuse_invalidate_attr(inode);
1541	if (res > 0)
1542		fuse_write_update_size(inode, iocb->ki_pos);
1543	inode_unlock(inode);
1544
1545	return res;
1546}
1547
1548static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1549{
1550	struct file *file = iocb->ki_filp;
1551	struct fuse_file *ff = file->private_data;
1552
1553	if (is_bad_inode(file_inode(file)))
1554		return -EIO;
1555
1556	if (!(ff->open_flags & FOPEN_DIRECT_IO))
1557		return fuse_cache_read_iter(iocb, to);
1558	else
1559		return fuse_direct_read_iter(iocb, to);
1560}
1561
1562static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1563{
1564	struct file *file = iocb->ki_filp;
1565	struct fuse_file *ff = file->private_data;
1566
1567	if (is_bad_inode(file_inode(file)))
1568		return -EIO;
1569
1570	if (!(ff->open_flags & FOPEN_DIRECT_IO))
1571		return fuse_cache_write_iter(iocb, from);
1572	else
1573		return fuse_direct_write_iter(iocb, from);
1574}
1575
1576static void fuse_writepage_free(struct fuse_writepage_args *wpa)
1577{
1578	struct fuse_args_pages *ap = &wpa->ia.ap;
1579	int i;
1580
1581	for (i = 0; i < ap->num_pages; i++)
1582		__free_page(ap->pages[i]);
1583
1584	if (wpa->ia.ff)
1585		fuse_file_put(wpa->ia.ff, false, false);
1586
1587	kfree(ap->pages);
1588	kfree(wpa);
1589}
1590
1591static void fuse_writepage_finish(struct fuse_conn *fc,
1592				  struct fuse_writepage_args *wpa)
1593{
1594	struct fuse_args_pages *ap = &wpa->ia.ap;
1595	struct inode *inode = wpa->inode;
1596	struct fuse_inode *fi = get_fuse_inode(inode);
1597	struct backing_dev_info *bdi = inode_to_bdi(inode);
1598	int i;
1599
1600	list_del(&wpa->writepages_entry);
1601	for (i = 0; i < ap->num_pages; i++) {
1602		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1603		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
1604		wb_writeout_inc(&bdi->wb);
1605	}
1606	wake_up(&fi->page_waitq);
1607}
1608
/*
 * Submit one writepage request as a background request, cropping its size
 * so that it does not extend past @size.
 *
 * If the request lies entirely at or beyond @size, or the submission fails,
 * the request and all auxiliary requests chained on wpa->next are finished
 * and freed here.
 *
 * Called under fi->lock, may release and reacquire it
 */
static void fuse_send_writepage(struct fuse_conn *fc,
				struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_writepage_args *aux, *next;
	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
	struct fuse_write_in *inarg = &wpa->ia.write.in;
	struct fuse_args *args = &wpa->ia.ap.args;
	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
	int err;

	/* Count the request as in flight; undone at out_free on failure */
	fi->writectr++;
	if (inarg->offset + data_size <= size) {
		inarg->size = data_size;
	} else if (inarg->offset < size) {
		/* Request straddles @size: write only the part below it */
		inarg->size = size - inarg->offset;
	} else {
		/* Got truncated off completely */
		goto out_free;
	}

	args->in_args[1].size = inarg->size;
	args->force = true;
	args->nocreds = true;

	/*
	 * First attempt is atomic since fi->lock is held; on -ENOMEM drop
	 * the lock and retry with an allocation that is not allowed to fail.
	 */
	err = fuse_simple_background(fc, args, GFP_ATOMIC);
	if (err == -ENOMEM) {
		spin_unlock(&fi->lock);
		err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
		spin_lock(&fi->lock);
	}

	/* Fails on broken connection only */
	if (unlikely(err))
		goto out_free;

	return;

 out_free:
	fi->writectr--;
	fuse_writepage_finish(fc, wpa);
	/* The frees below are done with fi->lock dropped */
	spin_unlock(&fi->lock);

	/* After fuse_writepage_finish() aux request list is private */
	for (aux = wpa->next; aux; aux = next) {
		next = aux->next;
		aux->next = NULL;
		fuse_writepage_free(aux);
	}

	fuse_writepage_free(wpa);
	spin_lock(&fi->lock);
}
1664
1665/*
1666 * If fi->writectr is positive (no truncate or fsync going on) send
1667 * all queued writepage requests.
1668 *
1669 * Called with fi->lock
1670 */
1671void fuse_flush_writepages(struct inode *inode)
1672__releases(fi->lock)
1673__acquires(fi->lock)
1674{
1675	struct fuse_conn *fc = get_fuse_conn(inode);
1676	struct fuse_inode *fi = get_fuse_inode(inode);
1677	loff_t crop = i_size_read(inode);
1678	struct fuse_writepage_args *wpa;
1679
1680	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1681		wpa = list_entry(fi->queued_writes.next,
1682				 struct fuse_writepage_args, queue_entry);
1683		list_del_init(&wpa->queue_entry);
1684		fuse_send_writepage(fc, wpa, crop);
1685	}
1686}
1687
/*
 * Completion callback for writepage requests (installed as ap->args.end).
 *
 * Records @error on the mapping, then sends every auxiliary request chained
 * on wpa->next, each cropped to the end of the primary request, and finally
 * finishes and frees the primary request.
 */
static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
			       int error)
{
	struct fuse_writepage_args *wpa =
		container_of(args, typeof(*wpa), ia.ap.args);
	struct inode *inode = wpa->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);

	mapping_set_error(inode->i_mapping, error);
	spin_lock(&fi->lock);
	while (wpa->next) {
		/* NOTE(review): this declaration shadows the fc parameter */
		struct fuse_conn *fc = get_fuse_conn(inode);
		struct fuse_write_in *inarg = &wpa->ia.write.in;
		struct fuse_writepage_args *next = wpa->next;

		/* Detach the next auxiliary request and make it a primary */
		wpa->next = next->next;
		next->next = NULL;
		next->ia.ff = fuse_file_get(wpa->ia.ff);
		list_add(&next->writepages_entry, &fi->writepages);

		/*
		 * Skip fuse_flush_writepages() to make it easy to crop requests
		 * based on primary request size.
		 *
		 * 1st case (trivial): there are no concurrent activities using
		 * fuse_set/release_nowrite.  Then we're on safe side because
		 * fuse_flush_writepages() would call fuse_send_writepage()
		 * anyway.
		 *
		 * 2nd case: someone called fuse_set_nowrite and it is waiting
		 * now for completion of all in-flight requests.  This happens
		 * rarely and no more than once per page, so this should be
		 * okay.
		 *
		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
		 * that fuse_set_nowrite returned implies that all in-flight
		 * requests were completed along with all of their secondary
		 * requests.  Further primary requests are blocked by negative
		 * writectr.  Hence there cannot be any in-flight requests and
		 * no invocations of fuse_writepage_end() while we're in
		 * fuse_set_nowrite..fuse_release_nowrite section.
		 */
		fuse_send_writepage(fc, next, inarg->offset + inarg->size);
	}
	/* Balances the increment made in fuse_send_writepage() */
	fi->writectr--;
	fuse_writepage_finish(fc, wpa);
	spin_unlock(&fi->lock);
	fuse_writepage_free(wpa);
}
1738
1739static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
1740					       struct fuse_inode *fi)
1741{
1742	struct fuse_file *ff = NULL;
1743
1744	spin_lock(&fi->lock);
1745	if (!list_empty(&fi->write_files)) {
1746		ff = list_entry(fi->write_files.next, struct fuse_file,
1747				write_entry);
1748		fuse_file_get(ff);
1749	}
1750	spin_unlock(&fi->lock);
1751
1752	return ff;
1753}
1754
1755static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
1756					     struct fuse_inode *fi)
1757{
1758	struct fuse_file *ff = __fuse_write_file_get(fc, fi);
1759	WARN_ON(!ff);
1760	return ff;
1761}
1762
1763int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
1764{
1765	struct fuse_conn *fc = get_fuse_conn(inode);
1766	struct fuse_inode *fi = get_fuse_inode(inode);
1767	struct fuse_file *ff;
1768	int err;
1769
1770	ff = __fuse_write_file_get(fc, fi);
1771	err = fuse_flush_times(inode, ff);
1772	if (ff)
1773		fuse_file_put(ff, false, false);
1774
1775	return err;
1776}
1777
1778static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
1779{
1780	struct fuse_writepage_args *wpa;
1781	struct fuse_args_pages *ap;
1782
1783	wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
1784	if (wpa) {
1785		ap = &wpa->ia.ap;
1786		ap->num_pages = 0;
1787		ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
1788		if (!ap->pages) {
1789			kfree(wpa);
1790			wpa = NULL;
1791		}
1792	}
1793	return wpa;
1794
1795}
1796
1797static int fuse_writepage_locked(struct page *page)
1798{
1799	struct address_space *mapping = page->mapping;
1800	struct inode *inode = mapping->host;
1801	struct fuse_conn *fc = get_fuse_conn(inode);
1802	struct fuse_inode *fi = get_fuse_inode(inode);
1803	struct fuse_writepage_args *wpa;
1804	struct fuse_args_pages *ap;
1805	struct page *tmp_page;
1806	int error = -ENOMEM;
1807
1808	set_page_writeback(page);
1809
1810	wpa = fuse_writepage_args_alloc();
1811	if (!wpa)
1812		goto err;
1813	ap = &wpa->ia.ap;
1814
1815	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1816	if (!tmp_page)
1817		goto err_free;
1818
1819	error = -EIO;
1820	wpa->ia.ff = fuse_write_file_get(fc, fi);
1821	if (!wpa->ia.ff)
1822		goto err_nofile;
1823
1824	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
1825
1826	copy_highpage(tmp_page, page);
1827	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
1828	wpa->next = NULL;
1829	ap->args.in_pages = true;
1830	ap->num_pages = 1;
1831	ap->pages[0] = tmp_page;
1832	ap->descs[0].offset = 0;
1833	ap->descs[0].length = PAGE_SIZE;
1834	ap->args.end = fuse_writepage_end;
1835	wpa->inode = inode;
1836
1837	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
1838	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
1839
1840	spin_lock(&fi->lock);
1841	list_add(&wpa->writepages_entry, &fi->writepages);
1842	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
1843	fuse_flush_writepages(inode);
1844	spin_unlock(&fi->lock);
1845
1846	end_page_writeback(page);
1847
1848	return 0;
1849
1850err_nofile:
1851	__free_page(tmp_page);
1852err_free:
1853	kfree(wpa);
1854err:
1855	mapping_set_error(page->mapping, error);
1856	end_page_writeback(page);
1857	return error;
1858}
1859
1860static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1861{
1862	int err;
1863
1864	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
1865		/*
1866		 * ->writepages() should be called for sync() and friends.  We
1867		 * should only get here on direct reclaim and then we are
1868		 * allowed to skip a page which is already in flight
1869		 */
1870		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
1871
1872		redirty_page_for_writepage(wbc, page);
1873		unlock_page(page);
1874		return 0;
1875	}
1876
1877	err = fuse_writepage_locked(page);
1878	unlock_page(page);
1879
1880	return err;
1881}
1882
/* State carried across fuse_writepages_fill() calls by fuse_writepages() */
struct fuse_fill_wb_data {
	/* Request currently being filled, or NULL if none is open */
	struct fuse_writepage_args *wpa;
	/* File used for the writes; looked up on first use, put at the end */
	struct fuse_file *ff;
	/* Inode whose pages are being written back */
	struct inode *inode;
	/* Original page-cache pages of the current request, by position */
	struct page **orig_pages;
	/* Current capacity of the request's page array */
	unsigned int max_pages;
};
1890
1891static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
1892{
1893	struct fuse_args_pages *ap = &data->wpa->ia.ap;
1894	struct fuse_conn *fc = get_fuse_conn(data->inode);
1895	struct page **pages;
1896	struct fuse_page_desc *descs;
1897	unsigned int npages = min_t(unsigned int,
1898				    max_t(unsigned int, data->max_pages * 2,
1899					  FUSE_DEFAULT_MAX_PAGES_PER_REQ),
1900				    fc->max_pages);
1901	WARN_ON(npages <= data->max_pages);
1902
1903	pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
1904	if (!pages)
1905		return false;
1906
1907	memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
1908	memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
1909	kfree(ap->pages);
1910	ap->pages = pages;
1911	ap->descs = descs;
1912	data->max_pages = npages;
1913
1914	return true;
1915}
1916
1917static void fuse_writepages_send(struct fuse_fill_wb_data *data)
1918{
1919	struct fuse_writepage_args *wpa = data->wpa;
1920	struct inode *inode = data->inode;
1921	struct fuse_inode *fi = get_fuse_inode(inode);
1922	int num_pages = wpa->ia.ap.num_pages;
1923	int i;
1924
1925	wpa->ia.ff = fuse_file_get(data->ff);
1926	spin_lock(&fi->lock);
1927	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
1928	fuse_flush_writepages(inode);
1929	spin_unlock(&fi->lock);
1930
1931	for (i = 0; i < num_pages; i++)
1932		end_page_writeback(data->orig_pages[i]);
1933}
1934
/*
 * First recheck under fi->lock if the offending offset is still under
 * writeback.  If yes, then iterate auxiliary write requests, to see if there's
 * one already added for a page at this offset.  If there's none, then insert
 * this new request onto the auxiliary list, otherwise reuse the existing one by
 * copying the new page contents over to the old temporary page.
 *
 * Returns false if the page is no longer under writeback, in which case
 * @new_wpa is back on fi->writepages and the caller carries on with it.
 * Returns true if @new_wpa was either chained as an auxiliary request or
 * freed after handing its page to an existing one.
 */
static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
				     struct page *page)
{
	struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
	struct fuse_writepage_args *tmp;
	struct fuse_writepage_args *old_wpa;
	struct fuse_args_pages *new_ap = &new_wpa->ia.ap;

	WARN_ON(new_ap->num_pages != 0);

	spin_lock(&fi->lock);
	/* Unlink new_wpa for the duration of the lookup below */
	list_del(&new_wpa->writepages_entry);
	old_wpa = fuse_find_writeback(fi, page->index, page->index);
	if (!old_wpa) {
		list_add(&new_wpa->writepages_entry, &fi->writepages);
		spin_unlock(&fi->lock);
		return false;
	}

	new_ap->num_pages = 1;
	for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
		pgoff_t curr_index;

		WARN_ON(tmp->inode != new_wpa->inode);
		curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
		if (curr_index == page->index) {
			WARN_ON(tmp->ia.ap.num_pages != 1);
			/*
			 * The existing auxiliary request takes the new page;
			 * the old temporary page ends up in new_wpa and is
			 * freed together with it below.
			 */
			swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
			break;
		}
	}

	/* No auxiliary request for this index yet: chain the new one */
	if (!tmp) {
		new_wpa->next = old_wpa->next;
		old_wpa->next = new_wpa;
	}

	spin_unlock(&fi->lock);

	if (tmp) {
		struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);

		/* Undo the accounting for the page that is being discarded */
		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
		dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
		wb_writeout_inc(&bdi->wb);
		fuse_writepage_free(new_wpa);
	}

	return true;
}
1992
/*
 * Per-page callback passed to write_cache_pages() by fuse_writepages().
 *
 * Adds @page to the request being assembled in data->wpa.  If the current
 * request cannot take the page (the page is already under writeback, the
 * request has hit a size limit, or the page is not contiguous with the
 * previous one) the current request is sent first and a new one is started.
 *
 * The page is unlocked on every return path.  Returns 0 on success, -EIO
 * if no file open for writing is available, or -ENOMEM.
 */
static int fuse_writepages_fill(struct page *page,
		struct writeback_control *wbc, void *_data)
{
	struct fuse_fill_wb_data *data = _data;
	struct fuse_writepage_args *wpa = data->wpa;
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct inode *inode = data->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct page *tmp_page;
	bool is_writeback;
	int err;

	/* Look up the file to write through on first use */
	if (!data->ff) {
		err = -EIO;
		data->ff = fuse_write_file_get(fc, fi);
		if (!data->ff)
			goto out_unlock;
	}

	/*
	 * Being under writeback is unlikely but possible.  For example direct
	 * read to an mmaped fuse file will set the page dirty twice; once when
	 * the pages are faulted with get_user_pages(), and then after the read
	 * completed.
	 */
	is_writeback = fuse_page_is_writeback(inode, page->index);

	/*
	 * Send the current request if this page cannot be appended to it;
	 * otherwise grow the page arrays when they are full, sending the
	 * request if they cannot be grown.
	 */
	if (wpa && ap->num_pages &&
	    (is_writeback || ap->num_pages == fc->max_pages ||
	     (ap->num_pages + 1) * PAGE_SIZE > fc->max_write ||
	     data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) {
		fuse_writepages_send(data);
		data->wpa = NULL;
	} else if (wpa && ap->num_pages == data->max_pages) {
		if (!fuse_pages_realloc(data)) {
			fuse_writepages_send(data);
			data->wpa = NULL;
		}
	}

	err = -ENOMEM;
	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!tmp_page)
		goto out_unlock;

	/*
	 * The page must not be redirtied until the writeout is completed
	 * (i.e. userspace has sent a reply to the write request).  Otherwise
	 * there could be more than one temporary page instance for each real
	 * page.
	 *
	 * This is ensured by holding the page lock in page_mkwrite() while
	 * checking fuse_page_is_writeback().  We already hold the page lock
	 * since clear_page_dirty_for_io() and keep it held until we add the
	 * request to the fi->writepages list and increment ap->num_pages.
	 * After this fuse_page_is_writeback() will indicate that the page is
	 * under writeback, so we can release the page lock.
	 */
	if (data->wpa == NULL) {
		err = -ENOMEM;
		wpa = fuse_writepage_args_alloc();
		if (!wpa) {
			__free_page(tmp_page);
			goto out_unlock;
		}
		/* A fresh request has room for exactly one page */
		data->max_pages = 1;

		ap = &wpa->ia.ap;
		fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
		wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
		wpa->next = NULL;
		ap->args.in_pages = true;
		ap->args.end = fuse_writepage_end;
		ap->num_pages = 0;
		wpa->inode = inode;

		spin_lock(&fi->lock);
		list_add(&wpa->writepages_entry, &fi->writepages);
		spin_unlock(&fi->lock);

		data->wpa = wpa;
	}
	set_page_writeback(page);

	/* The request carries a private copy, not the page-cache page */
	copy_highpage(tmp_page, page);
	ap->pages[ap->num_pages] = tmp_page;
	ap->descs[ap->num_pages].offset = 0;
	ap->descs[ap->num_pages].length = PAGE_SIZE;

	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);

	err = 0;
	/*
	 * If a write for this index is still in flight, the new request has
	 * been chained to it or merged into it; nothing more to build here.
	 */
	if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
		end_page_writeback(page);
		data->wpa = NULL;
		goto out_unlock;
	}
	data->orig_pages[ap->num_pages] = page;

	/*
	 * Protected by fi->lock against concurrent access by
	 * fuse_page_is_writeback().
	 */
	spin_lock(&fi->lock);
	ap->num_pages++;
	spin_unlock(&fi->lock);

out_unlock:
	unlock_page(page);

	return err;
}
2107
2108static int fuse_writepages(struct address_space *mapping,
2109			   struct writeback_control *wbc)
2110{
2111	struct inode *inode = mapping->host;
2112	struct fuse_conn *fc = get_fuse_conn(inode);
2113	struct fuse_fill_wb_data data;
2114	int err;
2115
2116	err = -EIO;
2117	if (is_bad_inode(inode))
2118		goto out;
2119
2120	data.inode = inode;
2121	data.wpa = NULL;
2122	data.ff = NULL;
2123
2124	err = -ENOMEM;
2125	data.orig_pages = kcalloc(fc->max_pages,
2126				  sizeof(struct page *),
2127				  GFP_NOFS);
2128	if (!data.orig_pages)
2129		goto out;
2130
2131	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
2132	if (data.wpa) {
2133		/* Ignore errors if we can write at least one page */
2134		WARN_ON(!data.wpa->ia.ap.num_pages);
2135		fuse_writepages_send(&data);
2136		err = 0;
2137	}
2138	if (data.ff)
2139		fuse_file_put(data.ff, false, false);
2140
2141	kfree(data.orig_pages);
2142out:
2143	return err;
2144}
2145
2146/*
2147 * It's worthy to make sure that space is reserved on disk for the write,
2148 * but how to implement it without killing performance need more thinking.
2149 */
2150static int fuse_write_begin(struct file *file, struct address_space *mapping,
2151		loff_t pos, unsigned len, unsigned flags,
2152		struct page **pagep, void **fsdata)
2153{
2154	pgoff_t index = pos >> PAGE_SHIFT;
2155	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
2156	struct page *page;
2157	loff_t fsize;
2158	int err = -ENOMEM;
2159
2160	WARN_ON(!fc->writeback_cache);
2161
2162	page = grab_cache_page_write_begin(mapping, index, flags);
2163	if (!page)
2164		goto error;
2165
2166	fuse_wait_on_page_writeback(mapping->host, page->index);
2167
2168	if (PageUptodate(page) || len == PAGE_SIZE)
2169		goto success;
2170	/*
2171	 * Check if the start this page comes after the end of file, in which
2172	 * case the readpage can be optimized away.
2173	 */
2174	fsize = i_size_read(mapping->host);
2175	if (fsize <= (pos & PAGE_MASK)) {
2176		size_t off = pos & ~PAGE_MASK;
2177		if (off)
2178			zero_user_segment(page, 0, off);
2179		goto success;
2180	}
2181	err = fuse_do_readpage(file, page);
2182	if (err)
2183		goto cleanup;
2184success:
2185	*pagep = page;
2186	return 0;
2187
2188cleanup:
2189	unlock_page(page);
2190	put_page(page);
2191error:
2192	return err;
2193}
2194
2195static int fuse_write_end(struct file *file, struct address_space *mapping,
2196		loff_t pos, unsigned len, unsigned copied,
2197		struct page *page, void *fsdata)
2198{
2199	struct inode *inode = page->mapping->host;
2200
2201	/* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
2202	if (!copied)
2203		goto unlock;
2204
2205	if (!PageUptodate(page)) {
2206		/* Zero any unwritten bytes at the end of the page */
2207		size_t endoff = (pos + copied) & ~PAGE_MASK;
2208		if (endoff)
2209			zero_user_segment(page, endoff, PAGE_SIZE);
2210		SetPageUptodate(page);
2211	}
2212
2213	fuse_write_update_size(inode, pos + copied);
2214	set_page_dirty(page);
2215
2216unlock:
2217	unlock_page(page);
2218	put_page(page);
2219
2220	return copied;
2221}
2222
2223static int fuse_launder_page(struct page *page)
2224{
2225	int err = 0;
2226	if (clear_page_dirty_for_io(page)) {
2227		struct inode *inode = page->mapping->host;
2228		err = fuse_writepage_locked(page);
2229		if (!err)
2230			fuse_wait_on_page_writeback(inode, page->index);
2231	}
2232	return err;
2233}
2234
2235/*
2236 * Write back dirty pages now, because there may not be any suitable
2237 * open files later
2238 */
2239static void fuse_vma_close(struct vm_area_struct *vma)
2240{
2241	filemap_write_and_wait(vma->vm_file->f_mapping);
2242}
2243
2244/*
2245 * Wait for writeback against this page to complete before allowing it
2246 * to be marked dirty again, and hence written back again, possibly
2247 * before the previous writepage completed.
2248 *
2249 * Block here, instead of in ->writepage(), so that the userspace fs
2250 * can only block processes actually operating on the filesystem.
2251 *
2252 * Otherwise unprivileged userspace fs would be able to block
2253 * unrelated:
2254 *
2255 * - page migration
2256 * - sync(2)
2257 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
2258 */
2259static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
2260{
2261	struct page *page = vmf->page;
2262	struct inode *inode = file_inode(vmf->vma->vm_file);
2263
2264	file_update_time(vmf->vma->vm_file);
2265	lock_page(page);
2266	if (page->mapping != inode->i_mapping) {
2267		unlock_page(page);
2268		return VM_FAULT_NOPAGE;
2269	}
2270
2271	fuse_wait_on_page_writeback(inode, page->index);
2272	return VM_FAULT_LOCKED;
2273}
2274
2275static const struct vm_operations_struct fuse_file_vm_ops = {
2276	.close		= fuse_vma_close,
2277	.fault		= filemap_fault,
2278	.map_pages	= filemap_map_pages,
2279	.page_mkwrite	= fuse_page_mkwrite,
2280};
2281
2282static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
2283{
2284	struct fuse_file *ff = file->private_data;
2285
2286	if (ff->open_flags & FOPEN_DIRECT_IO) {
2287		/* Can't provide the coherency needed for MAP_SHARED */
2288		if (vma->vm_flags & VM_MAYSHARE)
2289			return -ENODEV;
2290
2291		invalidate_inode_pages2(file->f_mapping);
2292
2293		return generic_file_mmap(file, vma);
2294	}
2295
2296	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
2297		fuse_link_write_file(file);
2298
2299	file_accessed(file);
2300	vma->vm_ops = &fuse_file_vm_ops;
2301	return 0;
2302}
2303
2304static int convert_fuse_file_lock(struct fuse_conn *fc,
2305				  const struct fuse_file_lock *ffl,
2306				  struct file_lock *fl)
2307{
2308	switch (ffl->type) {
2309	case F_UNLCK:
2310		break;
2311
2312	case F_RDLCK:
2313	case F_WRLCK:
2314		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
2315		    ffl->end < ffl->start)
2316			return -EIO;
2317
2318		fl->fl_start = ffl->start;
2319		fl->fl_end = ffl->end;
2320
2321		/*
2322		 * Convert pid into init's pid namespace.  The locks API will
2323		 * translate it into the caller's pid namespace.
2324		 */
2325		rcu_read_lock();
2326		fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
2327		rcu_read_unlock();
2328		break;
2329
2330	default:
2331		return -EIO;
2332	}
2333	fl->fl_type = ffl->type;
2334	return 0;
2335}
2336
2337static void fuse_lk_fill(struct fuse_args *args, struct file *file,
2338			 const struct file_lock *fl, int opcode, pid_t pid,
2339			 int flock, struct fuse_lk_in *inarg)
2340{
2341	struct inode *inode = file_inode(file);
2342	struct fuse_conn *fc = get_fuse_conn(inode);
2343	struct fuse_file *ff = file->private_data;
2344
2345	memset(inarg, 0, sizeof(*inarg));
2346	inarg->fh = ff->fh;
2347	inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
2348	inarg->lk.start = fl->fl_start;
2349	inarg->lk.end = fl->fl_end;
2350	inarg->lk.type = fl->fl_type;
2351	inarg->lk.pid = pid;
2352	if (flock)
2353		inarg->lk_flags |= FUSE_LK_FLOCK;
2354	args->opcode = opcode;
2355	args->nodeid = get_node_id(inode);
2356	args->in_numargs = 1;
2357	args->in_args[0].size = sizeof(*inarg);
2358	args->in_args[0].value = inarg;
2359}
2360
2361static int fuse_getlk(struct file *file, struct file_lock *fl)
2362{
2363	struct inode *inode = file_inode(file);
2364	struct fuse_conn *fc = get_fuse_conn(inode);
2365	FUSE_ARGS(args);
2366	struct fuse_lk_in inarg;
2367	struct fuse_lk_out outarg;
2368	int err;
2369
2370	fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
2371	args.out_numargs = 1;
2372	args.out_args[0].size = sizeof(outarg);
2373	args.out_args[0].value = &outarg;
2374	err = fuse_simple_request(fc, &args);
2375	if (!err)
2376		err = convert_fuse_file_lock(fc, &outarg.lk, fl);
2377
2378	return err;
2379}
2380
2381static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
2382{
2383	struct inode *inode = file_inode(file);
2384	struct fuse_conn *fc = get_fuse_conn(inode);
2385	FUSE_ARGS(args);
2386	struct fuse_lk_in inarg;
2387	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
2388	struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
2389	pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
2390	int err;
2391
2392	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
2393		/* NLM needs asynchronous locks, which we don't support yet */
2394		return -ENOLCK;
2395	}
2396
2397	/* Unlock on close is handled by the flush method */
2398	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
2399		return 0;
2400
2401	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
2402	err = fuse_simple_request(fc, &args);
2403
2404	/* locking is restartable */
2405	if (err == -EINTR)
2406		err = -ERESTARTSYS;
2407
2408	return err;
2409}
2410
2411static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
2412{
2413	struct inode *inode = file_inode(file);
2414	struct fuse_conn *fc = get_fuse_conn(inode);
2415	int err;
2416
2417	if (cmd == F_CANCELLK) {
2418		err = 0;
2419	} else if (cmd == F_GETLK) {
2420		if (fc->no_lock) {
2421			posix_test_lock(file, fl);
2422			err = 0;
2423		} else
2424			err = fuse_getlk(file, fl);
2425	} else {
2426		if (fc->no_lock)
2427			err = posix_lock_file(file, fl, NULL);
2428		else
2429			err = fuse_setlk(file, fl, 0);
2430	}
2431	return err;
2432}
2433
2434static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
2435{
2436	struct inode *inode = file_inode(file);
2437	struct fuse_conn *fc = get_fuse_conn(inode);
2438	int err;
2439
2440	if (fc->no_flock) {
2441		err = locks_lock_file_wait(file, fl);
2442	} else {
2443		struct fuse_file *ff = file->private_data;
2444
2445		/* emulate flock with POSIX locks */
2446		ff->flock = true;
2447		err = fuse_setlk(file, fl, 1);
2448	}
2449
2450	return err;
2451}
2452
2453static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
2454{
2455	struct inode *inode = mapping->host;
2456	struct fuse_conn *fc = get_fuse_conn(inode);
2457	FUSE_ARGS(args);
2458	struct fuse_bmap_in inarg;
2459	struct fuse_bmap_out outarg;
2460	int err;
2461
2462	if (!inode->i_sb->s_bdev || fc->no_bmap)
2463		return 0;
2464
2465	memset(&inarg, 0, sizeof(inarg));
2466	inarg.block = block;
2467	inarg.blocksize = inode->i_sb->s_blocksize;
2468	args.opcode = FUSE_BMAP;
2469	args.nodeid = get_node_id(inode);
2470	args.in_numargs = 1;
2471	args.in_args[0].size = sizeof(inarg);
2472	args.in_args[0].value = &inarg;
2473	args.out_numargs = 1;
2474	args.out_args[0].size = sizeof(outarg);
2475	args.out_args[0].value = &outarg;
2476	err = fuse_simple_request(fc, &args);
2477	if (err == -ENOSYS)
2478		fc->no_bmap = 1;
2479
2480	return err ? 0 : outarg.block;
2481}
2482
2483static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
2484{
2485	struct inode *inode = file->f_mapping->host;
2486	struct fuse_conn *fc = get_fuse_conn(inode);
2487	struct fuse_file *ff = file->private_data;
2488	FUSE_ARGS(args);
2489	struct fuse_lseek_in inarg = {
2490		.fh = ff->fh,
2491		.offset = offset,
2492		.whence = whence
2493	};
2494	struct fuse_lseek_out outarg;
2495	int err;
2496
2497	if (fc->no_lseek)
2498		goto fallback;
2499
2500	args.opcode = FUSE_LSEEK;
2501	args.nodeid = ff->nodeid;
2502	args.in_numargs = 1;
2503	args.in_args[0].size = sizeof(inarg);
2504	args.in_args[0].value = &inarg;
2505	args.out_numargs = 1;
2506	args.out_args[0].size = sizeof(outarg);
2507	args.out_args[0].value = &outarg;
2508	err = fuse_simple_request(fc, &args);
2509	if (err) {
2510		if (err == -ENOSYS) {
2511			fc->no_lseek = 1;
2512			goto fallback;
2513		}
2514		return err;
2515	}
2516
2517	return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
2518
2519fallback:
2520	err = fuse_update_attributes(inode, file);
2521	if (!err)
2522		return generic_file_llseek(file, offset, whence);
2523	else
2524		return err;
2525}
2526
2527static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
2528{
2529	loff_t retval;
2530	struct inode *inode = file_inode(file);
2531
2532	switch (whence) {
2533	case SEEK_SET:
2534	case SEEK_CUR:
2535		 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
2536		retval = generic_file_llseek(file, offset, whence);
2537		break;
2538	case SEEK_END:
2539		inode_lock(inode);
2540		retval = fuse_update_attributes(inode, file);
2541		if (!retval)
2542			retval = generic_file_llseek(file, offset, whence);
2543		inode_unlock(inode);
2544		break;
2545	case SEEK_HOLE:
2546	case SEEK_DATA:
2547		inode_lock(inode);
2548		retval = fuse_lseek(file, offset, whence);
2549		inode_unlock(inode);
2550		break;
2551	default:
2552		retval = -EINVAL;
2553	}
2554
2555	return retval;
2556}
2557
/*
 * The original ioctl ABI passed raw 'struct iovec', whose layout differs
 * between 32bit and 64bit.  That broke CUSE servers built for 32bit when
 * run on 64bit kernels.  The size of the reply tells which layout the
 * server used, so both can be accepted here.
 *
 * Copies @count entries from @src (@transferred bytes) into @dst.
 * Returns 0 on success, -EIO if the reply size matches neither layout,
 * or -EINVAL if a compat-layout reply arrives for a non-compat request.
 */
static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
				     size_t transferred, unsigned count,
				     bool is_compat)
{
#ifdef CONFIG_COMPAT
	if (transferred == count * sizeof(struct compat_iovec)) {
		struct compat_iovec *from = src;
		struct iovec *to = dst;
		unsigned left;

		/*
		 * With this interface a 32bit server cannot support
		 * non-compat (i.e. ones coming from 64bit apps) ioctl
		 * requests
		 */
		if (!is_compat)
			return -EINVAL;

		for (left = count; left; left--, from++, to++) {
			to->iov_base = compat_ptr(from->iov_base);
			to->iov_len = from->iov_len;
		}
		return 0;
	}
#endif

	/* Native layout: the reply must be exactly @count iovecs long. */
	if (transferred != count * sizeof(struct iovec))
		return -EIO;

	memcpy(dst, src, transferred);
	return 0;
}
2595
2596/* Make sure iov_length() won't overflow */
2597static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
2598				 size_t count)
2599{
2600	size_t n;
2601	u32 max = fc->max_pages << PAGE_SHIFT;
2602
2603	for (n = 0; n < count; n++, iov++) {
2604		if (iov->iov_len > (size_t) max)
2605			return -ENOMEM;
2606		max -= iov->iov_len;
2607	}
2608	return 0;
2609}
2610
2611static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
2612				 void *src, size_t transferred, unsigned count,
2613				 bool is_compat)
2614{
2615	unsigned i;
2616	struct fuse_ioctl_iovec *fiov = src;
2617
2618	if (fc->minor < 16) {
2619		return fuse_copy_ioctl_iovec_old(dst, src, transferred,
2620						 count, is_compat);
2621	}
2622
2623	if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
2624		return -EIO;
2625
2626	for (i = 0; i < count; i++) {
2627		/* Did the server supply an inappropriate value? */
2628		if (fiov[i].base != (unsigned long) fiov[i].base ||
2629		    fiov[i].len != (unsigned long) fiov[i].len)
2630			return -EIO;
2631
2632		dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
2633		dst[i].iov_len = (size_t) fiov[i].len;
2634
2635#ifdef CONFIG_COMPAT
2636		if (is_compat &&
2637		    (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
2638		     (compat_size_t) dst[i].iov_len != fiov[i].len))
2639			return -EIO;
2640#endif
2641	}
2642
2643	return 0;
2644}
2645
2646
/*
 * For ioctls, there is no generic way to determine how much memory
 * needs to be read and/or written.  Furthermore, ioctls are allowed
 * to dereference the passed pointer, so the parameter requires deep
 * copying but FUSE has no idea whatsoever about what to copy in or
 * out.
 *
 * This is solved by allowing FUSE server to retry ioctl with
 * necessary in/out iovecs.  Let's assume the ioctl implementation
 * needs to read in the following structure.
 *
 * struct a {
 *	char	*buf;
 *	size_t	buflen;
 * }
 *
 * On the first callout to FUSE server, inarg->in_size and
 * inarg->out_size will be zero; then, the server completes the ioctl
 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
 * the actual iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a) } }
 *
 * which tells FUSE to copy in the requested area and retry the ioctl.
 * On the second round, the server has access to the structure and
 * from that it can tell what to look for next, so on the invocation,
 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a)	},
 *   { .iov_base = a.buf,	.iov_len = a.buflen		} }
 *
 * FUSE will copy both struct a and the pointed buffer from the
 * process doing the ioctl and retry ioctl with both struct a and the
 * buffer.
 *
 * This time, FUSE server has everything it needs and completes ioctl
 * without FUSE_IOCTL_RETRY which finishes the ioctl call.
 *
 * Copying data out works the same way.
 *
 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
 * automatically initializes in and out iovs by decoding @cmd with
 * _IOC_* macros and the server is not allowed to request RETRY.  This
 * limits ioctl data transfers to well-formed ioctls and is the forced
 * behavior for all FUSE servers.
 *
 * @file:  file the ioctl was issued on
 * @cmd:   ioctl command number, forwarded to the server
 * @arg:   ioctl argument, forwarded to the server
 * @flags: FUSE_IOCTL_* flags (FUSE_IOCTL_COMPAT, FUSE_IOCTL_UNRESTRICTED)
 *
 * Returns the result reported by the server (outarg.result) on success,
 * or a negative errno on failure.
 */
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
		   unsigned int flags)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_ioctl_in inarg = {
		.fh = ff->fh,
		.cmd = cmd,
		.arg = arg,
		.flags = flags
	};
	struct fuse_ioctl_out outarg;
	struct iovec *iov_page = NULL;
	struct iovec *in_iov = NULL, *out_iov = NULL;
	unsigned int in_iovs = 0, out_iovs = 0, max_pages;
	size_t in_size, out_size, c;
	ssize_t transferred;
	int err, i;
	struct iov_iter ii;
	struct fuse_args_pages ap = {};

	/* Tell the server when the caller uses a 32bit ABI. */
#if BITS_PER_LONG == 32
	inarg.flags |= FUSE_IOCTL_32BIT;
#else
	if (flags & FUSE_IOCTL_COMPAT) {
		inarg.flags |= FUSE_IOCTL_32BIT;
#ifdef CONFIG_X86_X32
		if (in_x32_syscall())
			inarg.flags |= FUSE_IOCTL_COMPAT_X32;
#endif
	}
#endif

	/* assume all the iovs returned by client always fits in a page */
	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);

	/*
	 * ap.pages holds up to fc->max_pages data pages; iov_page is one
	 * page used to hold the in/out iovec array.  Both are released at
	 * the "out" label.
	 */
	err = -ENOMEM;
	ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
	if (!ap.pages || !iov_page)
		goto out;

	fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);

	/*
	 * If restricted, initialize IO parameters as encoded in @cmd.
	 * RETRY from server is not allowed.
	 */
	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
		struct iovec *iov = iov_page;

		iov->iov_base = (void __user *)arg;
		iov->iov_len = _IOC_SIZE(cmd);

		if (_IOC_DIR(cmd) & _IOC_WRITE) {
			in_iov = iov;
			in_iovs = 1;
		}

		if (_IOC_DIR(cmd) & _IOC_READ) {
			out_iov = iov;
			out_iovs = 1;
		}
	}

 retry:
	/* (Re)compute the transfer sizes from the current iovec arrays. */
	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
	inarg.out_size = out_size = iov_length(out_iov, out_iovs);

	/*
	 * Out data can be used either for actual out data or iovs,
	 * make sure there always is at least one page.
	 */
	out_size = max_t(size_t, out_size, PAGE_SIZE);
	max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);

	/* make sure there are enough buffer pages and init request with them */
	err = -ENOMEM;
	if (max_pages > fc->max_pages)
		goto out;
	/* Pages allocated on an earlier round are kept and reused. */
	while (ap.num_pages < max_pages) {
		ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
		if (!ap.pages[ap.num_pages])
			goto out;
		ap.num_pages++;
	}


	/* okay, let's send it to the client */
	ap.args.opcode = FUSE_IOCTL;
	ap.args.nodeid = ff->nodeid;
	ap.args.in_numargs = 1;
	ap.args.in_args[0].size = sizeof(inarg);
	ap.args.in_args[0].value = &inarg;
	if (in_size) {
		ap.args.in_numargs++;
		ap.args.in_args[1].size = in_size;
		ap.args.in_pages = true;

		/* Gather the caller's input data into the request pages. */
		err = -EFAULT;
		iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
			c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
			if (c != PAGE_SIZE && iov_iter_count(&ii))
				goto out;
		}
	}

	ap.args.out_numargs = 2;
	ap.args.out_args[0].size = sizeof(outarg);
	ap.args.out_args[0].value = &outarg;
	ap.args.out_args[1].size = out_size;
	ap.args.out_pages = true;
	ap.args.out_argvar = true;

	/* A non-negative return is used below as the reply payload length. */
	transferred = fuse_simple_request(fc, &ap.args);
	err = transferred;
	if (transferred < 0)
		goto out;

	/* did it ask for retry? */
	if (outarg.flags & FUSE_IOCTL_RETRY) {
		void *vaddr;

		/* no retry if in restricted mode */
		err = -EIO;
		if (!(flags & FUSE_IOCTL_UNRESTRICTED))
			goto out;

		in_iovs = outarg.in_iovs;
		out_iovs = outarg.out_iovs;

		/*
		 * Make sure things are in boundary, separate checks
		 * are to protect against overflow.
		 */
		err = -ENOMEM;
		if (in_iovs > FUSE_IOCTL_MAX_IOV ||
		    out_iovs > FUSE_IOCTL_MAX_IOV ||
		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
			goto out;

		/*
		 * The reply payload in the first page is the iovec array;
		 * convert it into iov_page (in iovecs first, then out).
		 */
		vaddr = kmap_atomic(ap.pages[0]);
		err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
					    transferred, in_iovs + out_iovs,
					    (flags & FUSE_IOCTL_COMPAT) != 0);
		kunmap_atomic(vaddr);
		if (err)
			goto out;

		in_iov = iov_page;
		out_iov = in_iov + in_iovs;

		err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
		if (err)
			goto out;

		err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
		if (err)
			goto out;

		goto retry;
	}

	/* The server must not return more data than was asked for. */
	err = -EIO;
	if (transferred > inarg.out_size)
		goto out;

	/* Scatter the reply data back to the caller's output iovecs. */
	err = -EFAULT;
	iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
		c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
		if (c != PAGE_SIZE && iov_iter_count(&ii))
			goto out;
	}
	err = 0;
 out:
	free_page((unsigned long) iov_page);
	while (ap.num_pages)
		__free_page(ap.pages[--ap.num_pages]);
	kfree(ap.pages);

	return err ? err : outarg.result;
}
EXPORT_SYMBOL_GPL(fuse_do_ioctl);
2878
2879long fuse_ioctl_common(struct file *file, unsigned int cmd,
2880		       unsigned long arg, unsigned int flags)
2881{
2882	struct inode *inode = file_inode(file);
2883	struct fuse_conn *fc = get_fuse_conn(inode);
2884
2885	if (!fuse_allow_current_process(fc))
2886		return -EACCES;
2887
2888	if (is_bad_inode(inode))
2889		return -EIO;
2890
2891	return fuse_do_ioctl(file, cmd, arg, flags);
2892}
2893
/* ->unlocked_ioctl: native ioctl, no extra FUSE_IOCTL_* flags. */
static long fuse_file_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	const unsigned int ioctl_flags = 0;

	return fuse_ioctl_common(file, cmd, arg, ioctl_flags);
}
2899
2900static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
2901				   unsigned long arg)
2902{
2903	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
2904}
2905
2906/*
2907 * All files which have been polled are linked to RB tree
2908 * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
2909 * find the matching one.
2910 */
2911static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
2912					      struct rb_node **parent_out)
2913{
2914	struct rb_node **link = &fc->polled_files.rb_node;
2915	struct rb_node *last = NULL;
2916
2917	while (*link) {
2918		struct fuse_file *ff;
2919
2920		last = *link;
2921		ff = rb_entry(last, struct fuse_file, polled_node);
2922
2923		if (kh < ff->kh)
2924			link = &last->rb_left;
2925		else if (kh > ff->kh)
2926			link = &last->rb_right;
2927		else
2928			return link;
2929	}
2930
2931	if (parent_out)
2932		*parent_out = last;
2933	return link;
2934}
2935
2936/*
2937 * The file is about to be polled.  Make sure it's on the polled_files
2938 * RB tree.  Note that files once added to the polled_files tree are
2939 * not removed before the file is released.  This is because a file
2940 * polled once is likely to be polled again.
2941 */
2942static void fuse_register_polled_file(struct fuse_conn *fc,
2943				      struct fuse_file *ff)
2944{
2945	spin_lock(&fc->lock);
2946	if (RB_EMPTY_NODE(&ff->polled_node)) {
2947		struct rb_node **link, *uninitialized_var(parent);
2948
2949		link = fuse_find_polled_node(fc, ff->kh, &parent);
2950		BUG_ON(*link);
2951		rb_link_node(&ff->polled_node, parent, link);
2952		rb_insert_color(&ff->polled_node, &fc->polled_files);
2953	}
2954	spin_unlock(&fc->lock);
2955}
2956
2957__poll_t fuse_file_poll(struct file *file, poll_table *wait)
2958{
2959	struct fuse_file *ff = file->private_data;
2960	struct fuse_conn *fc = ff->fc;
2961	struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
2962	struct fuse_poll_out outarg;
2963	FUSE_ARGS(args);
2964	int err;
2965
2966	if (fc->no_poll)
2967		return DEFAULT_POLLMASK;
2968
2969	poll_wait(file, &ff->poll_wait, wait);
2970	inarg.events = mangle_poll(poll_requested_events(wait));
2971
2972	/*
2973	 * Ask for notification iff there's someone waiting for it.
2974	 * The client may ignore the flag and always notify.
2975	 */
2976	if (waitqueue_active(&ff->poll_wait)) {
2977		inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
2978		fuse_register_polled_file(fc, ff);
2979	}
2980
2981	args.opcode = FUSE_POLL;
2982	args.nodeid = ff->nodeid;
2983	args.in_numargs = 1;
2984	args.in_args[0].size = sizeof(inarg);
2985	args.in_args[0].value = &inarg;
2986	args.out_numargs = 1;
2987	args.out_args[0].size = sizeof(outarg);
2988	args.out_args[0].value = &outarg;
2989	err = fuse_simple_request(fc, &args);
2990
2991	if (!err)
2992		return demangle_poll(outarg.revents);
2993	if (err == -ENOSYS) {
2994		fc->no_poll = 1;
2995		return DEFAULT_POLLMASK;
2996	}
2997	return EPOLLERR;
2998}
2999EXPORT_SYMBOL_GPL(fuse_file_poll);
3000
3001/*
3002 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
3003 * wakes up the poll waiters.
3004 */
3005int fuse_notify_poll_wakeup(struct fuse_conn *fc,
3006			    struct fuse_notify_poll_wakeup_out *outarg)
3007{
3008	u64 kh = outarg->kh;
3009	struct rb_node **link;
3010
3011	spin_lock(&fc->lock);
3012
3013	link = fuse_find_polled_node(fc, kh, NULL);
3014	if (*link) {
3015		struct fuse_file *ff;
3016
3017		ff = rb_entry(*link, struct fuse_file, polled_node);
3018		wake_up_interruptible_sync(&ff->poll_wait);
3019	}
3020
3021	spin_unlock(&fc->lock);
3022	return 0;
3023}
3024
3025static void fuse_do_truncate(struct file *file)
3026{
3027	struct inode *inode = file->f_mapping->host;
3028	struct iattr attr;
3029
3030	attr.ia_valid = ATTR_SIZE;
3031	attr.ia_size = i_size_read(inode);
3032
3033	attr.ia_file = file;
3034	attr.ia_valid |= ATTR_FILE;
3035
3036	fuse_do_setattr(file_dentry(file), &attr, file);
3037}
3038
3039static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
3040{
3041	return round_up(off, fc->max_pages << PAGE_SHIFT);
3042}
3043
/*
 * ->direct_IO address_space operation: perform a direct read or write
 * described by @iocb/@iter.
 *
 * A struct fuse_io_priv tracks the request.  When the connection has
 * async_dio set, requests are submitted asynchronously; if the caller is
 * a synchronous kiocb, or the write extends the file, the function still
 * waits for completion before returning.
 *
 * Returns the byte count or a negative errno; 0 for a read starting
 * beyond the cached file size; -ENOMEM if the tracking structure cannot
 * be allocated; -EIOCBQUEUED for an async request that is not waited for.
 */
static ssize_t
fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	ssize_t ret = 0;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	bool async_dio = ff->fc->async_dio;
	loff_t pos = 0;
	struct inode *inode;
	loff_t i_size;
	size_t count = iov_iter_count(iter);
	loff_t offset = iocb->ki_pos;
	struct fuse_io_priv *io;

	pos = offset;
	inode = file->f_mapping->host;
	i_size = i_size_read(inode);

	/* Reads that start past the known file size return nothing. */
	if ((iov_iter_rw(iter) == READ) && (offset > i_size))
		return 0;

	/* optimization for short read */
	if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
		if (offset >= i_size)
			return 0;
		iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
		count = iov_iter_count(iter);
	}

	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
	if (!io)
		return -ENOMEM;
	spin_lock_init(&io->lock);
	kref_init(&io->refcnt);
	io->reqs = 1;
	io->bytes = -1;
	io->size = 0;
	io->offset = offset;
	io->write = (iov_iter_rw(iter) == WRITE);
	io->err = 0;
	/*
	 * By default, we want to optimize all I/Os with async request
	 * submission to the client filesystem if supported.
	 */
	io->async = async_dio;
	io->iocb = iocb;
	io->blocking = is_sync_kiocb(iocb);

	/*
	 * We cannot asynchronously extend the size of a file.
	 * In such case the aio will behave exactly like sync io.
	 */
	if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
		io->blocking = true;

	if (io->async && io->blocking) {
		/*
		 * Additional reference to keep io around after
		 * calling fuse_aio_complete()
		 */
		kref_get(&io->refcnt);
		io->done = &wait;
	}

	if (iov_iter_rw(iter) == WRITE) {
		ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
		fuse_invalidate_attr(inode);
	} else {
		ret = __fuse_direct_read(io, iter, &pos);
	}

	if (io->async) {
		/*
		 * Sample io->blocking before fuse_aio_complete(): without
		 * the extra reference taken above, io must not be touched
		 * after that call (NOTE(review): presumably it may already
		 * have been released - confirm against fuse_aio_complete()).
		 */
		bool blocking = io->blocking;

		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);

		/* we have a non-extending, async request, so return */
		if (!blocking)
			return -EIOCBQUEUED;

		wait_for_completion(&wait);
		ret = fuse_get_res_by_io(io);
	}

	kref_put(&io->refcnt, fuse_io_release);

	if (iov_iter_rw(iter) == WRITE) {
		if (ret > 0)
			fuse_write_update_size(inode, pos);
		/* A failed extending write: resend the current size. */
		else if (ret < 0 && offset + count > i_size)
			fuse_do_truncate(file);
	}

	return ret;
}
3140
3141static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
3142{
3143	int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
3144
3145	if (!err)
3146		fuse_sync_writes(inode);
3147
3148	return err;
3149}
3150
3151static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
3152				loff_t length)
3153{
3154	struct fuse_file *ff = file->private_data;
3155	struct inode *inode = file_inode(file);
3156	struct fuse_inode *fi = get_fuse_inode(inode);
3157	struct fuse_conn *fc = ff->fc;
3158	FUSE_ARGS(args);
3159	struct fuse_fallocate_in inarg = {
3160		.fh = ff->fh,
3161		.offset = offset,
3162		.length = length,
3163		.mode = mode
3164	};
3165	int err;
3166	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
3167			   (mode & FALLOC_FL_PUNCH_HOLE);
3168
3169	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3170		return -EOPNOTSUPP;
3171
3172	if (fc->no_fallocate)
3173		return -EOPNOTSUPP;
3174
3175	if (lock_inode) {
3176		inode_lock(inode);
3177		if (mode & FALLOC_FL_PUNCH_HOLE) {
3178			loff_t endbyte = offset + length - 1;
3179
3180			err = fuse_writeback_range(inode, offset, endbyte);
3181			if (err)
3182				goto out;
3183		}
3184	}
3185
3186	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
3187	    offset + length > i_size_read(inode)) {
3188		err = inode_newsize_ok(inode, offset + length);
3189		if (err)
3190			goto out;
3191	}
3192
3193	if (!(mode & FALLOC_FL_KEEP_SIZE))
3194		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3195
3196	args.opcode = FUSE_FALLOCATE;
3197	args.nodeid = ff->nodeid;
3198	args.in_numargs = 1;
3199	args.in_args[0].size = sizeof(inarg);
3200	args.in_args[0].value = &inarg;
3201	err = fuse_simple_request(fc, &args);
3202	if (err == -ENOSYS) {
3203		fc->no_fallocate = 1;
3204		err = -EOPNOTSUPP;
3205	}
3206	if (err)
3207		goto out;
3208
3209	/* we could have extended the file */
3210	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3211		bool changed = fuse_write_update_size(inode, offset + length);
3212
3213		if (changed && fc->writeback_cache)
3214			file_update_time(file);
3215	}
3216
3217	if (mode & FALLOC_FL_PUNCH_HOLE)
3218		truncate_pagecache_range(inode, offset, offset + length - 1);
3219
3220	fuse_invalidate_attr(inode);
3221
3222out:
3223	if (!(mode & FALLOC_FL_KEEP_SIZE))
3224		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3225
3226	if (lock_inode)
3227		inode_unlock(inode);
3228
3229	return err;
3230}
3231
3232static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
3233				      struct file *file_out, loff_t pos_out,
3234				      size_t len, unsigned int flags)
3235{
3236	struct fuse_file *ff_in = file_in->private_data;
3237	struct fuse_file *ff_out = file_out->private_data;
3238	struct inode *inode_in = file_inode(file_in);
3239	struct inode *inode_out = file_inode(file_out);
3240	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
3241	struct fuse_conn *fc = ff_in->fc;
3242	FUSE_ARGS(args);
3243	struct fuse_copy_file_range_in inarg = {
3244		.fh_in = ff_in->fh,
3245		.off_in = pos_in,
3246		.nodeid_out = ff_out->nodeid,
3247		.fh_out = ff_out->fh,
3248		.off_out = pos_out,
3249		.len = len,
3250		.flags = flags
3251	};
3252	struct fuse_write_out outarg;
3253	ssize_t err;
3254	/* mark unstable when write-back is not used, and file_out gets
3255	 * extended */
3256	bool is_unstable = (!fc->writeback_cache) &&
3257			   ((pos_out + len) > inode_out->i_size);
3258
3259	if (fc->no_copy_file_range)
3260		return -EOPNOTSUPP;
3261
3262	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
3263		return -EXDEV;
3264
3265	if (fc->writeback_cache) {
3266		inode_lock(inode_in);
3267		err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
3268		inode_unlock(inode_in);
3269		if (err)
3270			return err;
3271	}
3272
3273	inode_lock(inode_out);
3274
3275	err = file_modified(file_out);
3276	if (err)
3277		goto out;
3278
3279	if (fc->writeback_cache) {
3280		err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
3281		if (err)
3282			goto out;
3283	}
3284
3285	if (is_unstable)
3286		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3287
3288	args.opcode = FUSE_COPY_FILE_RANGE;
3289	args.nodeid = ff_in->nodeid;
3290	args.in_numargs = 1;
3291	args.in_args[0].size = sizeof(inarg);
3292	args.in_args[0].value = &inarg;
3293	args.out_numargs = 1;
3294	args.out_args[0].size = sizeof(outarg);
3295	args.out_args[0].value = &outarg;
3296	err = fuse_simple_request(fc, &args);
3297	if (err == -ENOSYS) {
3298		fc->no_copy_file_range = 1;
3299		err = -EOPNOTSUPP;
3300	}
3301	if (err)
3302		goto out;
3303
3304	if (fc->writeback_cache) {
3305		fuse_write_update_size(inode_out, pos_out + outarg.size);
3306		file_update_time(file_out);
3307	}
3308
3309	fuse_invalidate_attr(inode_out);
3310
3311	err = outarg.size;
3312out:
3313	if (is_unstable)
3314		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3315
3316	inode_unlock(inode_out);
3317	file_accessed(file_in);
3318
3319	return err;
3320}
3321
3322static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
3323				    struct file *dst_file, loff_t dst_off,
3324				    size_t len, unsigned int flags)
3325{
3326	ssize_t ret;
3327
3328	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
3329				     len, flags);
3330
3331	if (ret == -EOPNOTSUPP || ret == -EXDEV)
3332		ret = generic_copy_file_range(src_file, src_off, dst_file,
3333					      dst_off, len, flags);
3334	return ret;
3335}
3336
3337static const struct file_operations fuse_file_operations = {
3338	.llseek		= fuse_file_llseek,
3339	.read_iter	= fuse_file_read_iter,
3340	.write_iter	= fuse_file_write_iter,
3341	.mmap		= fuse_file_mmap,
3342	.open		= fuse_open,
3343	.flush		= fuse_flush,
3344	.release	= fuse_release,
3345	.fsync		= fuse_fsync,
3346	.lock		= fuse_file_lock,
3347	.flock		= fuse_file_flock,
3348	.splice_read	= generic_file_splice_read,
3349	.splice_write	= iter_file_splice_write,
3350	.unlocked_ioctl	= fuse_file_ioctl,
3351	.compat_ioctl	= fuse_file_compat_ioctl,
3352	.poll		= fuse_file_poll,
3353	.fallocate	= fuse_file_fallocate,
3354	.copy_file_range = fuse_copy_file_range,
3355};
3356
3357static const struct address_space_operations fuse_file_aops  = {
3358	.readpage	= fuse_readpage,
3359	.writepage	= fuse_writepage,
3360	.writepages	= fuse_writepages,
3361	.launder_page	= fuse_launder_page,
3362	.readpages	= fuse_readpages,
3363	.set_page_dirty	= __set_page_dirty_nobuffers,
3364	.bmap		= fuse_bmap,
3365	.direct_IO	= fuse_direct_IO,
3366	.write_begin	= fuse_write_begin,
3367	.write_end	= fuse_write_end,
3368};
3369
3370void fuse_init_file_inode(struct inode *inode)
3371{
3372	struct fuse_inode *fi = get_fuse_inode(inode);
3373
3374	inode->i_fop = &fuse_file_operations;
3375	inode->i_data.a_ops = &fuse_file_aops;
3376
3377	INIT_LIST_HEAD(&fi->write_files);
3378	INIT_LIST_HEAD(&fi->queued_writes);
3379	fi->writectr = 0;
3380	init_waitqueue_head(&fi->page_waitq);
3381	INIT_LIST_HEAD(&fi->writepages);
3382}