fs/read_write.c at v5.9-rc8 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / read_write.c
at v5.9-rc8 55 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/read_write.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/slab.h>
   9#include <linux/stat.h>
  10#include <linux/sched/xacct.h>
  11#include <linux/fcntl.h>
  12#include <linux/file.h>
  13#include <linux/uio.h>
  14#include <linux/fsnotify.h>
  15#include <linux/security.h>
  16#include <linux/export.h>
  17#include <linux/syscalls.h>
  18#include <linux/pagemap.h>
  19#include <linux/splice.h>
  20#include <linux/compat.h>
  21#include <linux/mount.h>
  22#include <linux/fs.h>
  23#include "internal.h"
  24
  25#include <linux/uaccess.h>
  26#include <asm/unistd.h>
  27
  28const struct file_operations generic_ro_fops = {
  29	.llseek		= generic_file_llseek,
  30	.read_iter	= generic_file_read_iter,
  31	.mmap		= generic_file_readonly_mmap,
  32	.splice_read	= generic_file_splice_read,
  33};
  34
  35EXPORT_SYMBOL(generic_ro_fops);
  36
  37static inline bool unsigned_offsets(struct file *file)
  38{
  39	return file->f_mode & FMODE_UNSIGNED_OFFSET;
  40}
  41
  42/**
  43 * vfs_setpos - update the file offset for lseek
  44 * @file:	file structure in question
  45 * @offset:	file offset to seek to
  46 * @maxsize:	maximum file size
  47 *
  48 * This is a low-level filesystem helper for updating the file offset to
  49 * the value specified by @offset if the given offset is valid and it is
  50 * not equal to the current file offset.
  51 *
  52 * Return the specified offset on success and -EINVAL on invalid offset.
  53 */
  54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  55{
  56	if (offset < 0 && !unsigned_offsets(file))
  57		return -EINVAL;
  58	if (offset > maxsize)
  59		return -EINVAL;
  60
  61	if (offset != file->f_pos) {
  62		file->f_pos = offset;
  63		file->f_version = 0;
  64	}
  65	return offset;
  66}
  67EXPORT_SYMBOL(vfs_setpos);
  68
  69/**
  70 * generic_file_llseek_size - generic llseek implementation for regular files
  71 * @file:	file structure to seek on
  72 * @offset:	file offset to seek to
  73 * @whence:	type of seek
  74 * @size:	max size of this file in file system
  75 * @eof:	offset used for SEEK_END position
  76 *
  77 * This is a variant of generic_file_llseek that allows passing in a custom
  78 * maximum file size and a custom EOF position, for e.g. hashed directories
  79 *
  80 * Synchronization:
  81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  83 * read/writes behave like SEEK_SET against seeks.
  84 */
  85loff_t
  86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  87		loff_t maxsize, loff_t eof)
  88{
  89	switch (whence) {
  90	case SEEK_END:
  91		offset += eof;
  92		break;
  93	case SEEK_CUR:
  94		/*
  95		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  96		 * position-querying operation.  Avoid rewriting the "same"
  97		 * f_pos value back to the file because a concurrent read(),
  98		 * write() or lseek() might have altered it
  99		 */
 100		if (offset == 0)
 101			return file->f_pos;
 102		/*
 103		 * f_lock protects against read/modify/write race with other
 104		 * SEEK_CURs. Note that parallel writes and reads behave
 105		 * like SEEK_SET.
 106		 */
 107		spin_lock(&file->f_lock);
 108		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 109		spin_unlock(&file->f_lock);
 110		return offset;
 111	case SEEK_DATA:
 112		/*
 113		 * In the generic case the entire file is data, so as long as
 114		 * offset isn't at the end of the file then the offset is data.
 115		 */
 116		if ((unsigned long long)offset >= eof)
 117			return -ENXIO;
 118		break;
 119	case SEEK_HOLE:
 120		/*
 121		 * There is a virtual hole at the end of the file, so as long as
 122		 * offset isn't i_size or larger, return i_size.
 123		 */
 124		if ((unsigned long long)offset >= eof)
 125			return -ENXIO;
 126		offset = eof;
 127		break;
 128	}
 129
 130	return vfs_setpos(file, offset, maxsize);
 131}
 132EXPORT_SYMBOL(generic_file_llseek_size);
 133
 134/**
 135 * generic_file_llseek - generic llseek implementation for regular files
 136 * @file:	file structure to seek on
 137 * @offset:	file offset to seek to
 138 * @whence:	type of seek
 139 *
 140 * This is a generic implemenation of ->llseek useable for all normal local
 141 * filesystems.  It just updates the file offset to the value specified by
 142 * @offset and @whence.
 143 */
 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 145{
 146	struct inode *inode = file->f_mapping->host;
 147
 148	return generic_file_llseek_size(file, offset, whence,
 149					inode->i_sb->s_maxbytes,
 150					i_size_read(inode));
 151}
 152EXPORT_SYMBOL(generic_file_llseek);
 153
 154/**
 155 * fixed_size_llseek - llseek implementation for fixed-sized devices
 156 * @file:	file structure to seek on
 157 * @offset:	file offset to seek to
 158 * @whence:	type of seek
 159 * @size:	size of the file
 160 *
 161 */
 162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 163{
 164	switch (whence) {
 165	case SEEK_SET: case SEEK_CUR: case SEEK_END:
 166		return generic_file_llseek_size(file, offset, whence,
 167						size, size);
 168	default:
 169		return -EINVAL;
 170	}
 171}
 172EXPORT_SYMBOL(fixed_size_llseek);
 173
 174/**
 175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 176 * @file:	file structure to seek on
 177 * @offset:	file offset to seek to
 178 * @whence:	type of seek
 179 *
 180 */
 181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 182{
 183	switch (whence) {
 184	case SEEK_SET: case SEEK_CUR:
 185		return generic_file_llseek_size(file, offset, whence,
 186						OFFSET_MAX, 0);
 187	default:
 188		return -EINVAL;
 189	}
 190}
 191EXPORT_SYMBOL(no_seek_end_llseek);
 192
 193/**
 194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 195 * @file:	file structure to seek on
 196 * @offset:	file offset to seek to
 197 * @whence:	type of seek
 198 * @size:	maximal offset allowed
 199 *
 200 */
 201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 202{
 203	switch (whence) {
 204	case SEEK_SET: case SEEK_CUR:
 205		return generic_file_llseek_size(file, offset, whence,
 206						size, 0);
 207	default:
 208		return -EINVAL;
 209	}
 210}
 211EXPORT_SYMBOL(no_seek_end_llseek_size);
 212
 213/**
 214 * noop_llseek - No Operation Performed llseek implementation
 215 * @file:	file structure to seek on
 216 * @offset:	file offset to seek to
 217 * @whence:	type of seek
 218 *
 219 * This is an implementation of ->llseek useable for the rare special case when
 220 * userspace expects the seek to succeed but the (device) file is actually not
 221 * able to perform the seek. In this case you use noop_llseek() instead of
 222 * falling back to the default implementation of ->llseek.
 223 */
 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 225{
 226	return file->f_pos;
 227}
 228EXPORT_SYMBOL(noop_llseek);
 229
 230loff_t no_llseek(struct file *file, loff_t offset, int whence)
 231{
 232	return -ESPIPE;
 233}
 234EXPORT_SYMBOL(no_llseek);
 235
 236loff_t default_llseek(struct file *file, loff_t offset, int whence)
 237{
 238	struct inode *inode = file_inode(file);
 239	loff_t retval;
 240
 241	inode_lock(inode);
 242	switch (whence) {
 243		case SEEK_END:
 244			offset += i_size_read(inode);
 245			break;
 246		case SEEK_CUR:
 247			if (offset == 0) {
 248				retval = file->f_pos;
 249				goto out;
 250			}
 251			offset += file->f_pos;
 252			break;
 253		case SEEK_DATA:
 254			/*
 255			 * In the generic case the entire file is data, so as
 256			 * long as offset isn't at the end of the file then the
 257			 * offset is data.
 258			 */
 259			if (offset >= inode->i_size) {
 260				retval = -ENXIO;
 261				goto out;
 262			}
 263			break;
 264		case SEEK_HOLE:
 265			/*
 266			 * There is a virtual hole at the end of the file, so
 267			 * as long as offset isn't i_size or larger, return
 268			 * i_size.
 269			 */
 270			if (offset >= inode->i_size) {
 271				retval = -ENXIO;
 272				goto out;
 273			}
 274			offset = inode->i_size;
 275			break;
 276	}
 277	retval = -EINVAL;
 278	if (offset >= 0 || unsigned_offsets(file)) {
 279		if (offset != file->f_pos) {
 280			file->f_pos = offset;
 281			file->f_version = 0;
 282		}
 283		retval = offset;
 284	}
 285out:
 286	inode_unlock(inode);
 287	return retval;
 288}
 289EXPORT_SYMBOL(default_llseek);
 290
 291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 292{
 293	loff_t (*fn)(struct file *, loff_t, int);
 294
 295	fn = no_llseek;
 296	if (file->f_mode & FMODE_LSEEK) {
 297		if (file->f_op->llseek)
 298			fn = file->f_op->llseek;
 299	}
 300	return fn(file, offset, whence);
 301}
 302EXPORT_SYMBOL(vfs_llseek);
 303
 304static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 305{
 306	off_t retval;
 307	struct fd f = fdget_pos(fd);
 308	if (!f.file)
 309		return -EBADF;
 310
 311	retval = -EINVAL;
 312	if (whence <= SEEK_MAX) {
 313		loff_t res = vfs_llseek(f.file, offset, whence);
 314		retval = res;
 315		if (res != (loff_t)retval)
 316			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 317	}
 318	fdput_pos(f);
 319	return retval;
 320}
 321
 322SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 323{
 324	return ksys_lseek(fd, offset, whence);
 325}
 326
 327#ifdef CONFIG_COMPAT
 328COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 329{
 330	return ksys_lseek(fd, offset, whence);
 331}
 332#endif
 333
 334#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
 335	defined(__ARCH_WANT_SYS_LLSEEK)
 336SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 337		unsigned long, offset_low, loff_t __user *, result,
 338		unsigned int, whence)
 339{
 340	int retval;
 341	struct fd f = fdget_pos(fd);
 342	loff_t offset;
 343
 344	if (!f.file)
 345		return -EBADF;
 346
 347	retval = -EINVAL;
 348	if (whence > SEEK_MAX)
 349		goto out_putf;
 350
 351	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 352			whence);
 353
 354	retval = (int)offset;
 355	if (offset >= 0) {
 356		retval = -EFAULT;
 357		if (!copy_to_user(result, &offset, sizeof(offset)))
 358			retval = 0;
 359	}
 360out_putf:
 361	fdput_pos(f);
 362	return retval;
 363}
 364#endif
 365
 366int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 367{
 368	struct inode *inode;
 369	int retval = -EINVAL;
 370
 371	inode = file_inode(file);
 372	if (unlikely((ssize_t) count < 0))
 373		return retval;
 374
 375	/*
 376	 * ranged mandatory locking does not apply to streams - it makes sense
 377	 * only for files where position has a meaning.
 378	 */
 379	if (ppos) {
 380		loff_t pos = *ppos;
 381
 382		if (unlikely(pos < 0)) {
 383			if (!unsigned_offsets(file))
 384				return retval;
 385			if (count >= -pos) /* both values are in 0..LLONG_MAX */
 386				return -EOVERFLOW;
 387		} else if (unlikely((loff_t) (pos + count) < 0)) {
 388			if (!unsigned_offsets(file))
 389				return retval;
 390		}
 391
 392		if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 393			retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
 394					read_write == READ ? F_RDLCK : F_WRLCK);
 395			if (retval < 0)
 396				return retval;
 397		}
 398	}
 399
 400	return security_file_permission(file,
 401				read_write == READ ? MAY_READ : MAY_WRITE);
 402}
 403
 404static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 405{
 406	struct iovec iov = { .iov_base = buf, .iov_len = len };
 407	struct kiocb kiocb;
 408	struct iov_iter iter;
 409	ssize_t ret;
 410
 411	init_sync_kiocb(&kiocb, filp);
 412	kiocb.ki_pos = (ppos ? *ppos : 0);
 413	iov_iter_init(&iter, READ, &iov, 1, len);
 414
 415	ret = call_read_iter(filp, &kiocb, &iter);
 416	BUG_ON(ret == -EIOCBQUEUED);
 417	if (ppos)
 418		*ppos = kiocb.ki_pos;
 419	return ret;
 420}
 421
 422ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 423{
 424	mm_segment_t old_fs = get_fs();
 425	ssize_t ret;
 426
 427	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
 428		return -EINVAL;
 429	if (!(file->f_mode & FMODE_CAN_READ))
 430		return -EINVAL;
 431
 432	if (count > MAX_RW_COUNT)
 433		count =  MAX_RW_COUNT;
 434	set_fs(KERNEL_DS);
 435	if (file->f_op->read)
 436		ret = file->f_op->read(file, (void __user *)buf, count, pos);
 437	else if (file->f_op->read_iter)
 438		ret = new_sync_read(file, (void __user *)buf, count, pos);
 439	else
 440		ret = -EINVAL;
 441	set_fs(old_fs);
 442	if (ret > 0) {
 443		fsnotify_access(file);
 444		add_rchar(current, ret);
 445	}
 446	inc_syscr(current);
 447	return ret;
 448}
 449
 450ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 451{
 452	ssize_t ret;
 453
 454	ret = rw_verify_area(READ, file, pos, count);
 455	if (ret)
 456		return ret;
 457	return __kernel_read(file, buf, count, pos);
 458}
 459EXPORT_SYMBOL(kernel_read);
 460
 461ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 462{
 463	ssize_t ret;
 464
 465	if (!(file->f_mode & FMODE_READ))
 466		return -EBADF;
 467	if (!(file->f_mode & FMODE_CAN_READ))
 468		return -EINVAL;
 469	if (unlikely(!access_ok(buf, count)))
 470		return -EFAULT;
 471
 472	ret = rw_verify_area(READ, file, pos, count);
 473	if (ret)
 474		return ret;
 475	if (count > MAX_RW_COUNT)
 476		count =  MAX_RW_COUNT;
 477
 478	if (file->f_op->read)
 479		ret = file->f_op->read(file, buf, count, pos);
 480	else if (file->f_op->read_iter)
 481		ret = new_sync_read(file, buf, count, pos);
 482	else
 483		ret = -EINVAL;
 484	if (ret > 0) {
 485		fsnotify_access(file);
 486		add_rchar(current, ret);
 487	}
 488	inc_syscr(current);
 489	return ret;
 490}
 491
 492static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 493{
 494	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 495	struct kiocb kiocb;
 496	struct iov_iter iter;
 497	ssize_t ret;
 498
 499	init_sync_kiocb(&kiocb, filp);
 500	kiocb.ki_pos = (ppos ? *ppos : 0);
 501	iov_iter_init(&iter, WRITE, &iov, 1, len);
 502
 503	ret = call_write_iter(filp, &kiocb, &iter);
 504	BUG_ON(ret == -EIOCBQUEUED);
 505	if (ret > 0 && ppos)
 506		*ppos = kiocb.ki_pos;
 507	return ret;
 508}
 509
 510/* caller is responsible for file_start_write/file_end_write */
 511ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 512{
 513	mm_segment_t old_fs;
 514	const char __user *p;
 515	ssize_t ret;
 516
 517	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
 518		return -EBADF;
 519	if (!(file->f_mode & FMODE_CAN_WRITE))
 520		return -EINVAL;
 521
 522	old_fs = get_fs();
 523	set_fs(KERNEL_DS);
 524	p = (__force const char __user *)buf;
 525	if (count > MAX_RW_COUNT)
 526		count =  MAX_RW_COUNT;
 527	if (file->f_op->write)
 528		ret = file->f_op->write(file, p, count, pos);
 529	else if (file->f_op->write_iter)
 530		ret = new_sync_write(file, p, count, pos);
 531	else
 532		ret = -EINVAL;
 533	set_fs(old_fs);
 534	if (ret > 0) {
 535		fsnotify_modify(file);
 536		add_wchar(current, ret);
 537	}
 538	inc_syscw(current);
 539	return ret;
 540}
 541/*
 542 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 543 * but autofs is one of the few internal kernel users that actually
 544 * wants this _and_ can be built as a module. So we need to export
 545 * this symbol for autofs, even though it really isn't appropriate
 546 * for any other kernel modules.
 547 */
 548EXPORT_SYMBOL_GPL(__kernel_write);
 549
 550ssize_t kernel_write(struct file *file, const void *buf, size_t count,
 551			    loff_t *pos)
 552{
 553	ssize_t ret;
 554
 555	ret = rw_verify_area(WRITE, file, pos, count);
 556	if (ret)
 557		return ret;
 558
 559	file_start_write(file);
 560	ret =  __kernel_write(file, buf, count, pos);
 561	file_end_write(file);
 562	return ret;
 563}
 564EXPORT_SYMBOL(kernel_write);
 565
 566ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 567{
 568	ssize_t ret;
 569
 570	if (!(file->f_mode & FMODE_WRITE))
 571		return -EBADF;
 572	if (!(file->f_mode & FMODE_CAN_WRITE))
 573		return -EINVAL;
 574	if (unlikely(!access_ok(buf, count)))
 575		return -EFAULT;
 576
 577	ret = rw_verify_area(WRITE, file, pos, count);
 578	if (ret)
 579		return ret;
 580	if (count > MAX_RW_COUNT)
 581		count =  MAX_RW_COUNT;
 582	file_start_write(file);
 583	if (file->f_op->write)
 584		ret = file->f_op->write(file, buf, count, pos);
 585	else if (file->f_op->write_iter)
 586		ret = new_sync_write(file, buf, count, pos);
 587	else
 588		ret = -EINVAL;
 589	if (ret > 0) {
 590		fsnotify_modify(file);
 591		add_wchar(current, ret);
 592	}
 593	inc_syscw(current);
 594	file_end_write(file);
 595	return ret;
 596}
 597
 598/* file_ppos returns &file->f_pos or NULL if file is stream */
 599static inline loff_t *file_ppos(struct file *file)
 600{
 601	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
 602}
 603
 604ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 605{
 606	struct fd f = fdget_pos(fd);
 607	ssize_t ret = -EBADF;
 608
 609	if (f.file) {
 610		loff_t pos, *ppos = file_ppos(f.file);
 611		if (ppos) {
 612			pos = *ppos;
 613			ppos = &pos;
 614		}
 615		ret = vfs_read(f.file, buf, count, ppos);
 616		if (ret >= 0 && ppos)
 617			f.file->f_pos = pos;
 618		fdput_pos(f);
 619	}
 620	return ret;
 621}
 622
 623SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 624{
 625	return ksys_read(fd, buf, count);
 626}
 627
 628ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 629{
 630	struct fd f = fdget_pos(fd);
 631	ssize_t ret = -EBADF;
 632
 633	if (f.file) {
 634		loff_t pos, *ppos = file_ppos(f.file);
 635		if (ppos) {
 636			pos = *ppos;
 637			ppos = &pos;
 638		}
 639		ret = vfs_write(f.file, buf, count, ppos);
 640		if (ret >= 0 && ppos)
 641			f.file->f_pos = pos;
 642		fdput_pos(f);
 643	}
 644
 645	return ret;
 646}
 647
 648SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 649		size_t, count)
 650{
 651	return ksys_write(fd, buf, count);
 652}
 653
 654ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
 655		     loff_t pos)
 656{
 657	struct fd f;
 658	ssize_t ret = -EBADF;
 659
 660	if (pos < 0)
 661		return -EINVAL;
 662
 663	f = fdget(fd);
 664	if (f.file) {
 665		ret = -ESPIPE;
 666		if (f.file->f_mode & FMODE_PREAD)
 667			ret = vfs_read(f.file, buf, count, &pos);
 668		fdput(f);
 669	}
 670
 671	return ret;
 672}
 673
 674SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 675			size_t, count, loff_t, pos)
 676{
 677	return ksys_pread64(fd, buf, count, pos);
 678}
 679
 680ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
 681		      size_t count, loff_t pos)
 682{
 683	struct fd f;
 684	ssize_t ret = -EBADF;
 685
 686	if (pos < 0)
 687		return -EINVAL;
 688
 689	f = fdget(fd);
 690	if (f.file) {
 691		ret = -ESPIPE;
 692		if (f.file->f_mode & FMODE_PWRITE)  
 693			ret = vfs_write(f.file, buf, count, &pos);
 694		fdput(f);
 695	}
 696
 697	return ret;
 698}
 699
 700SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 701			 size_t, count, loff_t, pos)
 702{
 703	return ksys_pwrite64(fd, buf, count, pos);
 704}
 705
 706static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 707		loff_t *ppos, int type, rwf_t flags)
 708{
 709	struct kiocb kiocb;
 710	ssize_t ret;
 711
 712	init_sync_kiocb(&kiocb, filp);
 713	ret = kiocb_set_rw_flags(&kiocb, flags);
 714	if (ret)
 715		return ret;
 716	kiocb.ki_pos = (ppos ? *ppos : 0);
 717
 718	if (type == READ)
 719		ret = call_read_iter(filp, &kiocb, iter);
 720	else
 721		ret = call_write_iter(filp, &kiocb, iter);
 722	BUG_ON(ret == -EIOCBQUEUED);
 723	if (ppos)
 724		*ppos = kiocb.ki_pos;
 725	return ret;
 726}
 727
 728/* Do it by hand, with file-ops */
 729static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 730		loff_t *ppos, int type, rwf_t flags)
 731{
 732	ssize_t ret = 0;
 733
 734	if (flags & ~RWF_HIPRI)
 735		return -EOPNOTSUPP;
 736
 737	while (iov_iter_count(iter)) {
 738		struct iovec iovec = iov_iter_iovec(iter);
 739		ssize_t nr;
 740
 741		if (type == READ) {
 742			nr = filp->f_op->read(filp, iovec.iov_base,
 743					      iovec.iov_len, ppos);
 744		} else {
 745			nr = filp->f_op->write(filp, iovec.iov_base,
 746					       iovec.iov_len, ppos);
 747		}
 748
 749		if (nr < 0) {
 750			if (!ret)
 751				ret = nr;
 752			break;
 753		}
 754		ret += nr;
 755		if (nr != iovec.iov_len)
 756			break;
 757		iov_iter_advance(iter, nr);
 758	}
 759
 760	return ret;
 761}
 762
 763/**
 764 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 765 *     into the kernel and check that it is valid.
 766 *
 767 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 768 * @uvector: Pointer to the userspace array.
 769 * @nr_segs: Number of elements in userspace array.
 770 * @fast_segs: Number of elements in @fast_pointer.
 771 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 772 * @ret_pointer: (output parameter) Pointer to a variable that will point to
 773 *     either @fast_pointer, a newly allocated kernel array, or NULL,
 774 *     depending on which array was used.
 775 *
 776 * This function copies an array of &struct iovec of @nr_segs from
 777 * userspace into the kernel and checks that each element is valid (e.g.
 778 * it does not point to a kernel address or cause overflow by being too
 779 * large, etc.).
 780 *
 781 * As an optimization, the caller may provide a pointer to a small
 782 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 783 * (the size of this array, or 0 if unused, should be given in @fast_segs).
 784 *
 785 * @ret_pointer will always point to the array that was used, so the
 786 * caller must take care not to call kfree() on it e.g. in case the
 787 * @fast_pointer array was used and it was allocated on the stack.
 788 *
 789 * Return: The total number of bytes covered by the iovec array on success
 790 *   or a negative error code on error.
 791 */
 792ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 793			      unsigned long nr_segs, unsigned long fast_segs,
 794			      struct iovec *fast_pointer,
 795			      struct iovec **ret_pointer)
 796{
 797	unsigned long seg;
 798	ssize_t ret;
 799	struct iovec *iov = fast_pointer;
 800
 801	/*
 802	 * SuS says "The readv() function *may* fail if the iovcnt argument
 803	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 804	 * traditionally returned zero for zero segments, so...
 805	 */
 806	if (nr_segs == 0) {
 807		ret = 0;
 808		goto out;
 809	}
 810
 811	/*
 812	 * First get the "struct iovec" from user memory and
 813	 * verify all the pointers
 814	 */
 815	if (nr_segs > UIO_MAXIOV) {
 816		ret = -EINVAL;
 817		goto out;
 818	}
 819	if (nr_segs > fast_segs) {
 820		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
 821		if (iov == NULL) {
 822			ret = -ENOMEM;
 823			goto out;
 824		}
 825	}
 826	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 827		ret = -EFAULT;
 828		goto out;
 829	}
 830
 831	/*
 832	 * According to the Single Unix Specification we should return EINVAL
 833	 * if an element length is < 0 when cast to ssize_t or if the
 834	 * total length would overflow the ssize_t return value of the
 835	 * system call.
 836	 *
 837	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 838	 * overflow case.
 839	 */
 840	ret = 0;
 841	for (seg = 0; seg < nr_segs; seg++) {
 842		void __user *buf = iov[seg].iov_base;
 843		ssize_t len = (ssize_t)iov[seg].iov_len;
 844
 845		/* see if we we're about to use an invalid len or if
 846		 * it's about to overflow ssize_t */
 847		if (len < 0) {
 848			ret = -EINVAL;
 849			goto out;
 850		}
 851		if (type >= 0
 852		    && unlikely(!access_ok(buf, len))) {
 853			ret = -EFAULT;
 854			goto out;
 855		}
 856		if (len > MAX_RW_COUNT - ret) {
 857			len = MAX_RW_COUNT - ret;
 858			iov[seg].iov_len = len;
 859		}
 860		ret += len;
 861	}
 862out:
 863	*ret_pointer = iov;
 864	return ret;
 865}
 866
 867#ifdef CONFIG_COMPAT
 868ssize_t compat_rw_copy_check_uvector(int type,
 869		const struct compat_iovec __user *uvector, unsigned long nr_segs,
 870		unsigned long fast_segs, struct iovec *fast_pointer,
 871		struct iovec **ret_pointer)
 872{
 873	compat_ssize_t tot_len;
 874	struct iovec *iov = *ret_pointer = fast_pointer;
 875	ssize_t ret = 0;
 876	int seg;
 877
 878	/*
 879	 * SuS says "The readv() function *may* fail if the iovcnt argument
 880	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 881	 * traditionally returned zero for zero segments, so...
 882	 */
 883	if (nr_segs == 0)
 884		goto out;
 885
 886	ret = -EINVAL;
 887	if (nr_segs > UIO_MAXIOV)
 888		goto out;
 889	if (nr_segs > fast_segs) {
 890		ret = -ENOMEM;
 891		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
 892		if (iov == NULL)
 893			goto out;
 894	}
 895	*ret_pointer = iov;
 896
 897	ret = -EFAULT;
 898	if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
 899		goto out;
 900
 901	/*
 902	 * Single unix specification:
 903	 * We should -EINVAL if an element length is not >= 0 and fitting an
 904	 * ssize_t.
 905	 *
 906	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
 907	 * no overflow possibility.
 908	 */
 909	tot_len = 0;
 910	ret = -EINVAL;
 911	for (seg = 0; seg < nr_segs; seg++) {
 912		compat_uptr_t buf;
 913		compat_ssize_t len;
 914
 915		if (__get_user(len, &uvector->iov_len) ||
 916		   __get_user(buf, &uvector->iov_base)) {
 917			ret = -EFAULT;
 918			goto out;
 919		}
 920		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
 921			goto out;
 922		if (type >= 0 &&
 923		    !access_ok(compat_ptr(buf), len)) {
 924			ret = -EFAULT;
 925			goto out;
 926		}
 927		if (len > MAX_RW_COUNT - tot_len)
 928			len = MAX_RW_COUNT - tot_len;
 929		tot_len += len;
 930		iov->iov_base = compat_ptr(buf);
 931		iov->iov_len = (compat_size_t) len;
 932		uvector++;
 933		iov++;
 934	}
 935	ret = tot_len;
 936
 937out:
 938	return ret;
 939}
 940#endif
 941
 942static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
 943		loff_t *pos, rwf_t flags)
 944{
 945	size_t tot_len;
 946	ssize_t ret = 0;
 947
 948	if (!(file->f_mode & FMODE_READ))
 949		return -EBADF;
 950	if (!(file->f_mode & FMODE_CAN_READ))
 951		return -EINVAL;
 952
 953	tot_len = iov_iter_count(iter);
 954	if (!tot_len)
 955		goto out;
 956	ret = rw_verify_area(READ, file, pos, tot_len);
 957	if (ret < 0)
 958		return ret;
 959
 960	if (file->f_op->read_iter)
 961		ret = do_iter_readv_writev(file, iter, pos, READ, flags);
 962	else
 963		ret = do_loop_readv_writev(file, iter, pos, READ, flags);
 964out:
 965	if (ret >= 0)
 966		fsnotify_access(file);
 967	return ret;
 968}
 969
 970ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
 971			   struct iov_iter *iter)
 972{
 973	size_t tot_len;
 974	ssize_t ret = 0;
 975
 976	if (!file->f_op->read_iter)
 977		return -EINVAL;
 978	if (!(file->f_mode & FMODE_READ))
 979		return -EBADF;
 980	if (!(file->f_mode & FMODE_CAN_READ))
 981		return -EINVAL;
 982
 983	tot_len = iov_iter_count(iter);
 984	if (!tot_len)
 985		goto out;
 986	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
 987	if (ret < 0)
 988		return ret;
 989
 990	ret = call_read_iter(file, iocb, iter);
 991out:
 992	if (ret >= 0)
 993		fsnotify_access(file);
 994	return ret;
 995}
 996EXPORT_SYMBOL(vfs_iocb_iter_read);
 997
 998ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 999		rwf_t flags)
1000{
1001	if (!file->f_op->read_iter)
1002		return -EINVAL;
1003	return do_iter_read(file, iter, ppos, flags);
1004}
1005EXPORT_SYMBOL(vfs_iter_read);
1006
1007static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
1008		loff_t *pos, rwf_t flags)
1009{
1010	size_t tot_len;
1011	ssize_t ret = 0;
1012
1013	if (!(file->f_mode & FMODE_WRITE))
1014		return -EBADF;
1015	if (!(file->f_mode & FMODE_CAN_WRITE))
1016		return -EINVAL;
1017
1018	tot_len = iov_iter_count(iter);
1019	if (!tot_len)
1020		return 0;
1021	ret = rw_verify_area(WRITE, file, pos, tot_len);
1022	if (ret < 0)
1023		return ret;
1024
1025	if (file->f_op->write_iter)
1026		ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
1027	else
1028		ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
1029	if (ret > 0)
1030		fsnotify_modify(file);
1031	return ret;
1032}
1033
1034ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
1035			    struct iov_iter *iter)
1036{
1037	size_t tot_len;
1038	ssize_t ret = 0;
1039
1040	if (!file->f_op->write_iter)
1041		return -EINVAL;
1042	if (!(file->f_mode & FMODE_WRITE))
1043		return -EBADF;
1044	if (!(file->f_mode & FMODE_CAN_WRITE))
1045		return -EINVAL;
1046
1047	tot_len = iov_iter_count(iter);
1048	if (!tot_len)
1049		return 0;
1050	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
1051	if (ret < 0)
1052		return ret;
1053
1054	ret = call_write_iter(file, iocb, iter);
1055	if (ret > 0)
1056		fsnotify_modify(file);
1057
1058	return ret;
1059}
1060EXPORT_SYMBOL(vfs_iocb_iter_write);
1061
1062ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
1063		rwf_t flags)
1064{
1065	if (!file->f_op->write_iter)
1066		return -EINVAL;
1067	return do_iter_write(file, iter, ppos, flags);
1068}
1069EXPORT_SYMBOL(vfs_iter_write);
1070
1071ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
1072		  unsigned long vlen, loff_t *pos, rwf_t flags)
1073{
1074	struct iovec iovstack[UIO_FASTIOV];
1075	struct iovec *iov = iovstack;
1076	struct iov_iter iter;
1077	ssize_t ret;
1078
1079	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1080	if (ret >= 0) {
1081		ret = do_iter_read(file, &iter, pos, flags);
1082		kfree(iov);
1083	}
1084
1085	return ret;
1086}
1087
1088static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
1089		   unsigned long vlen, loff_t *pos, rwf_t flags)
1090{
1091	struct iovec iovstack[UIO_FASTIOV];
1092	struct iovec *iov = iovstack;
1093	struct iov_iter iter;
1094	ssize_t ret;
1095
1096	ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1097	if (ret >= 0) {
1098		file_start_write(file);
1099		ret = do_iter_write(file, &iter, pos, flags);
1100		file_end_write(file);
1101		kfree(iov);
1102	}
1103	return ret;
1104}
1105
1106static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1107			unsigned long vlen, rwf_t flags)
1108{
1109	struct fd f = fdget_pos(fd);
1110	ssize_t ret = -EBADF;
1111
1112	if (f.file) {
1113		loff_t pos, *ppos = file_ppos(f.file);
1114		if (ppos) {
1115			pos = *ppos;
1116			ppos = &pos;
1117		}
1118		ret = vfs_readv(f.file, vec, vlen, ppos, flags);
1119		if (ret >= 0 && ppos)
1120			f.file->f_pos = pos;
1121		fdput_pos(f);
1122	}
1123
1124	if (ret > 0)
1125		add_rchar(current, ret);
1126	inc_syscr(current);
1127	return ret;
1128}
1129
1130static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1131			 unsigned long vlen, rwf_t flags)
1132{
1133	struct fd f = fdget_pos(fd);
1134	ssize_t ret = -EBADF;
1135
1136	if (f.file) {
1137		loff_t pos, *ppos = file_ppos(f.file);
1138		if (ppos) {
1139			pos = *ppos;
1140			ppos = &pos;
1141		}
1142		ret = vfs_writev(f.file, vec, vlen, ppos, flags);
1143		if (ret >= 0 && ppos)
1144			f.file->f_pos = pos;
1145		fdput_pos(f);
1146	}
1147
1148	if (ret > 0)
1149		add_wchar(current, ret);
1150	inc_syscw(current);
1151	return ret;
1152}
1153
1154static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1155{
1156#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1157	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1158}
1159
1160static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1161			 unsigned long vlen, loff_t pos, rwf_t flags)
1162{
1163	struct fd f;
1164	ssize_t ret = -EBADF;
1165
1166	if (pos < 0)
1167		return -EINVAL;
1168
1169	f = fdget(fd);
1170	if (f.file) {
1171		ret = -ESPIPE;
1172		if (f.file->f_mode & FMODE_PREAD)
1173			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1174		fdput(f);
1175	}
1176
1177	if (ret > 0)
1178		add_rchar(current, ret);
1179	inc_syscr(current);
1180	return ret;
1181}
1182
1183static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1184			  unsigned long vlen, loff_t pos, rwf_t flags)
1185{
1186	struct fd f;
1187	ssize_t ret = -EBADF;
1188
1189	if (pos < 0)
1190		return -EINVAL;
1191
1192	f = fdget(fd);
1193	if (f.file) {
1194		ret = -ESPIPE;
1195		if (f.file->f_mode & FMODE_PWRITE)
1196			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1197		fdput(f);
1198	}
1199
1200	if (ret > 0)
1201		add_wchar(current, ret);
1202	inc_syscw(current);
1203	return ret;
1204}
1205
1206SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1207		unsigned long, vlen)
1208{
1209	return do_readv(fd, vec, vlen, 0);
1210}
1211
1212SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1213		unsigned long, vlen)
1214{
1215	return do_writev(fd, vec, vlen, 0);
1216}
1217
1218SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1219		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1220{
1221	loff_t pos = pos_from_hilo(pos_h, pos_l);
1222
1223	return do_preadv(fd, vec, vlen, pos, 0);
1224}
1225
1226SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1227		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1228		rwf_t, flags)
1229{
1230	loff_t pos = pos_from_hilo(pos_h, pos_l);
1231
1232	if (pos == -1)
1233		return do_readv(fd, vec, vlen, flags);
1234
1235	return do_preadv(fd, vec, vlen, pos, flags);
1236}
1237
1238SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1239		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1240{
1241	loff_t pos = pos_from_hilo(pos_h, pos_l);
1242
1243	return do_pwritev(fd, vec, vlen, pos, 0);
1244}
1245
1246SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1247		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1248		rwf_t, flags)
1249{
1250	loff_t pos = pos_from_hilo(pos_h, pos_l);
1251
1252	if (pos == -1)
1253		return do_writev(fd, vec, vlen, flags);
1254
1255	return do_pwritev(fd, vec, vlen, pos, flags);
1256}
1257
1258#ifdef CONFIG_COMPAT
1259static size_t compat_readv(struct file *file,
1260			   const struct compat_iovec __user *vec,
1261			   unsigned long vlen, loff_t *pos, rwf_t flags)
1262{
1263	struct iovec iovstack[UIO_FASTIOV];
1264	struct iovec *iov = iovstack;
1265	struct iov_iter iter;
1266	ssize_t ret;
1267
1268	ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1269	if (ret >= 0) {
1270		ret = do_iter_read(file, &iter, pos, flags);
1271		kfree(iov);
1272	}
1273	if (ret > 0)
1274		add_rchar(current, ret);
1275	inc_syscr(current);
1276	return ret;
1277}
1278
1279static size_t do_compat_readv(compat_ulong_t fd,
1280				 const struct compat_iovec __user *vec,
1281				 compat_ulong_t vlen, rwf_t flags)
1282{
1283	struct fd f = fdget_pos(fd);
1284	ssize_t ret;
1285	loff_t pos;
1286
1287	if (!f.file)
1288		return -EBADF;
1289	pos = f.file->f_pos;
1290	ret = compat_readv(f.file, vec, vlen, &pos, flags);
1291	if (ret >= 0)
1292		f.file->f_pos = pos;
1293	fdput_pos(f);
1294	return ret;
1295
1296}
1297
1298COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1299		const struct compat_iovec __user *,vec,
1300		compat_ulong_t, vlen)
1301{
1302	return do_compat_readv(fd, vec, vlen, 0);
1303}
1304
1305static long do_compat_preadv64(unsigned long fd,
1306				  const struct compat_iovec __user *vec,
1307				  unsigned long vlen, loff_t pos, rwf_t flags)
1308{
1309	struct fd f;
1310	ssize_t ret;
1311
1312	if (pos < 0)
1313		return -EINVAL;
1314	f = fdget(fd);
1315	if (!f.file)
1316		return -EBADF;
1317	ret = -ESPIPE;
1318	if (f.file->f_mode & FMODE_PREAD)
1319		ret = compat_readv(f.file, vec, vlen, &pos, flags);
1320	fdput(f);
1321	return ret;
1322}
1323
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64: the offset arrives as a single loff_t argument.
 * Flags are 0.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_preadv64(fd, vec, vlen, pos, no_flags);
}
#endif
1332
1333COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1334		const struct compat_iovec __user *,vec,
1335		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1336{
1337	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1338
1339	return do_compat_preadv64(fd, vec, vlen, pos, 0);
1340}
1341
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2: single loff_t offset plus caller-supplied @flags.
 * An offset of -1 selects do_compat_readv() (file's own position).
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos != -1)
		return do_compat_preadv64(fd, vec, vlen, pos, flags);

	return do_compat_readv(fd, vec, vlen, flags);
}
#endif
1353
1354COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1355		const struct compat_iovec __user *,vec,
1356		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1357		rwf_t, flags)
1358{
1359	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1360
1361	if (pos == -1)
1362		return do_compat_readv(fd, vec, vlen, flags);
1363
1364	return do_compat_preadv64(fd, vec, vlen, pos, flags);
1365}
1366
1367static size_t compat_writev(struct file *file,
1368			    const struct compat_iovec __user *vec,
1369			    unsigned long vlen, loff_t *pos, rwf_t flags)
1370{
1371	struct iovec iovstack[UIO_FASTIOV];
1372	struct iovec *iov = iovstack;
1373	struct iov_iter iter;
1374	ssize_t ret;
1375
1376	ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1377	if (ret >= 0) {
1378		file_start_write(file);
1379		ret = do_iter_write(file, &iter, pos, flags);
1380		file_end_write(file);
1381		kfree(iov);
1382	}
1383	if (ret > 0)
1384		add_wchar(current, ret);
1385	inc_syscw(current);
1386	return ret;
1387}
1388
1389static size_t do_compat_writev(compat_ulong_t fd,
1390				  const struct compat_iovec __user* vec,
1391				  compat_ulong_t vlen, rwf_t flags)
1392{
1393	struct fd f = fdget_pos(fd);
1394	ssize_t ret;
1395	loff_t pos;
1396
1397	if (!f.file)
1398		return -EBADF;
1399	pos = f.file->f_pos;
1400	ret = compat_writev(f.file, vec, vlen, &pos, flags);
1401	if (ret >= 0)
1402		f.file->f_pos = pos;
1403	fdput_pos(f);
1404	return ret;
1405}
1406
1407COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1408		const struct compat_iovec __user *, vec,
1409		compat_ulong_t, vlen)
1410{
1411	return do_compat_writev(fd, vec, vlen, 0);
1412}
1413
1414static long do_compat_pwritev64(unsigned long fd,
1415				   const struct compat_iovec __user *vec,
1416				   unsigned long vlen, loff_t pos, rwf_t flags)
1417{
1418	struct fd f;
1419	ssize_t ret;
1420
1421	if (pos < 0)
1422		return -EINVAL;
1423	f = fdget(fd);
1424	if (!f.file)
1425		return -EBADF;
1426	ret = -ESPIPE;
1427	if (f.file->f_mode & FMODE_PWRITE)
1428		ret = compat_writev(f.file, vec, vlen, &pos, flags);
1429	fdput(f);
1430	return ret;
1431}
1432
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64: the offset arrives as a single loff_t argument.
 * Flags are 0.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_pwritev64(fd, vec, vlen, pos, no_flags);
}
#endif
1441
1442COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1443		const struct compat_iovec __user *,vec,
1444		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1445{
1446	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1447
1448	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1449}
1450
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2: single loff_t offset plus caller-supplied @flags.
 * An offset of -1 selects do_compat_writev() (file's own position).
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos != -1)
		return do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return do_compat_writev(fd, vec, vlen, flags);
}
#endif
1462
1463COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1464		const struct compat_iovec __user *,vec,
1465		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1466{
1467	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1468
1469	if (pos == -1)
1470		return do_compat_writev(fd, vec, vlen, flags);
1471
1472	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1473}
1474
1475#endif
1476
1477static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1478		  	   size_t count, loff_t max)
1479{
1480	struct fd in, out;
1481	struct inode *in_inode, *out_inode;
1482	loff_t pos;
1483	loff_t out_pos;
1484	ssize_t retval;
1485	int fl;
1486
1487	/*
1488	 * Get input file, and verify that it is ok..
1489	 */
1490	retval = -EBADF;
1491	in = fdget(in_fd);
1492	if (!in.file)
1493		goto out;
1494	if (!(in.file->f_mode & FMODE_READ))
1495		goto fput_in;
1496	retval = -ESPIPE;
1497	if (!ppos) {
1498		pos = in.file->f_pos;
1499	} else {
1500		pos = *ppos;
1501		if (!(in.file->f_mode & FMODE_PREAD))
1502			goto fput_in;
1503	}
1504	retval = rw_verify_area(READ, in.file, &pos, count);
1505	if (retval < 0)
1506		goto fput_in;
1507	if (count > MAX_RW_COUNT)
1508		count =  MAX_RW_COUNT;
1509
1510	/*
1511	 * Get output file, and verify that it is ok..
1512	 */
1513	retval = -EBADF;
1514	out = fdget(out_fd);
1515	if (!out.file)
1516		goto fput_in;
1517	if (!(out.file->f_mode & FMODE_WRITE))
1518		goto fput_out;
1519	in_inode = file_inode(in.file);
1520	out_inode = file_inode(out.file);
1521	out_pos = out.file->f_pos;
1522	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1523	if (retval < 0)
1524		goto fput_out;
1525
1526	if (!max)
1527		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1528
1529	if (unlikely(pos + count > max)) {
1530		retval = -EOVERFLOW;
1531		if (pos >= max)
1532			goto fput_out;
1533		count = max - pos;
1534	}
1535
1536	fl = 0;
1537#if 0
1538	/*
1539	 * We need to debate whether we can enable this or not. The
1540	 * man page documents EAGAIN return for the output at least,
1541	 * and the application is arguably buggy if it doesn't expect
1542	 * EAGAIN on a non-blocking file descriptor.
1543	 */
1544	if (in.file->f_flags & O_NONBLOCK)
1545		fl = SPLICE_F_NONBLOCK;
1546#endif
1547	file_start_write(out.file);
1548	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1549	file_end_write(out.file);
1550
1551	if (retval > 0) {
1552		add_rchar(current, retval);
1553		add_wchar(current, retval);
1554		fsnotify_access(in.file);
1555		fsnotify_modify(out.file);
1556		out.file->f_pos = out_pos;
1557		if (ppos)
1558			*ppos = pos;
1559		else
1560			in.file->f_pos = pos;
1561	}
1562
1563	inc_syscr(current);
1564	inc_syscw(current);
1565	if (pos > max)
1566		retval = -EOVERFLOW;
1567
1568fput_out:
1569	fdput(out);
1570fput_in:
1571	fdput(in);
1572out:
1573	return retval;
1574}
1575
1576SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1577{
1578	loff_t pos;
1579	off_t off;
1580	ssize_t ret;
1581
1582	if (offset) {
1583		if (unlikely(get_user(off, offset)))
1584			return -EFAULT;
1585		pos = off;
1586		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1587		if (unlikely(put_user(pos, offset)))
1588			return -EFAULT;
1589		return ret;
1590	}
1591
1592	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1593}
1594
1595SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1596{
1597	loff_t pos;
1598	ssize_t ret;
1599
1600	if (offset) {
1601		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1602			return -EFAULT;
1603		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1604		if (unlikely(put_user(pos, offset)))
1605			return -EFAULT;
1606		return ret;
1607	}
1608
1609	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1610}
1611
1612#ifdef CONFIG_COMPAT
1613COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1614		compat_off_t __user *, offset, compat_size_t, count)
1615{
1616	loff_t pos;
1617	off_t off;
1618	ssize_t ret;
1619
1620	if (offset) {
1621		if (unlikely(get_user(off, offset)))
1622			return -EFAULT;
1623		pos = off;
1624		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1625		if (unlikely(put_user(pos, offset)))
1626			return -EFAULT;
1627		return ret;
1628	}
1629
1630	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1631}
1632
1633COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1634		compat_loff_t __user *, offset, compat_size_t, count)
1635{
1636	loff_t pos;
1637	ssize_t ret;
1638
1639	if (offset) {
1640		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1641			return -EFAULT;
1642		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1643		if (unlikely(put_user(pos, offset)))
1644			return -EFAULT;
1645		return ret;
1646	}
1647
1648	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1649}
1650#endif
1651
1652/**
1653 * generic_copy_file_range - copy data between two files
1654 * @file_in:	file structure to read from
1655 * @pos_in:	file offset to read from
1656 * @file_out:	file structure to write data to
1657 * @pos_out:	file offset to write data to
1658 * @len:	amount of data to copy
1659 * @flags:	copy flags
1660 *
1661 * This is a generic filesystem helper to copy data from one file to another.
1662 * It has no constraints on the source or destination file owners - the files
1663 * can belong to different superblocks and different filesystem types. Short
1664 * copies are allowed.
1665 *
1666 * This should be called from the @file_out filesystem, as per the
1667 * ->copy_file_range() method.
1668 *
1669 * Returns the number of bytes copied or a negative error indicating the
1670 * failure.
1671 */
1672
1673ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1674				struct file *file_out, loff_t pos_out,
1675				size_t len, unsigned int flags)
1676{
1677	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1678				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1679}
1680EXPORT_SYMBOL(generic_copy_file_range);
1681
1682static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
1683				  struct file *file_out, loff_t pos_out,
1684				  size_t len, unsigned int flags)
1685{
1686	/*
1687	 * Although we now allow filesystems to handle cross sb copy, passing
1688	 * a file of the wrong filesystem type to filesystem driver can result
1689	 * in an attempt to dereference the wrong type of ->private_data, so
1690	 * avoid doing that until we really have a good reason.  NFS defines
1691	 * several different file_system_type structures, but they all end up
1692	 * using the same ->copy_file_range() function pointer.
1693	 */
1694	if (file_out->f_op->copy_file_range &&
1695	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
1696		return file_out->f_op->copy_file_range(file_in, pos_in,
1697						       file_out, pos_out,
1698						       len, flags);
1699
1700	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1701				       flags);
1702}
1703
1704/*
1705 * copy_file_range() differs from regular file read and write in that it
1706 * specifically allows return partial success.  When it does so is up to
1707 * the copy_file_range method.
1708 */
1709ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1710			    struct file *file_out, loff_t pos_out,
1711			    size_t len, unsigned int flags)
1712{
1713	ssize_t ret;
1714
1715	if (flags != 0)
1716		return -EINVAL;
1717
1718	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1719				       flags);
1720	if (unlikely(ret))
1721		return ret;
1722
1723	ret = rw_verify_area(READ, file_in, &pos_in, len);
1724	if (unlikely(ret))
1725		return ret;
1726
1727	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1728	if (unlikely(ret))
1729		return ret;
1730
1731	if (len == 0)
1732		return 0;
1733
1734	file_start_write(file_out);
1735
1736	/*
1737	 * Try cloning first, this is supported by more file systems, and
1738	 * more efficient if both clone and copy are supported (e.g. NFS).
1739	 */
1740	if (file_in->f_op->remap_file_range &&
1741	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
1742		loff_t cloned;
1743
1744		cloned = file_in->f_op->remap_file_range(file_in, pos_in,
1745				file_out, pos_out,
1746				min_t(loff_t, MAX_RW_COUNT, len),
1747				REMAP_FILE_CAN_SHORTEN);
1748		if (cloned > 0) {
1749			ret = cloned;
1750			goto done;
1751		}
1752	}
1753
1754	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1755				flags);
1756	WARN_ON_ONCE(ret == -EOPNOTSUPP);
1757done:
1758	if (ret > 0) {
1759		fsnotify_access(file_in);
1760		add_rchar(current, ret);
1761		fsnotify_modify(file_out);
1762		add_wchar(current, ret);
1763	}
1764
1765	inc_syscr(current);
1766	inc_syscw(current);
1767
1768	file_end_write(file_out);
1769
1770	return ret;
1771}
1772EXPORT_SYMBOL(vfs_copy_file_range);
1773
1774SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1775		int, fd_out, loff_t __user *, off_out,
1776		size_t, len, unsigned int, flags)
1777{
1778	loff_t pos_in;
1779	loff_t pos_out;
1780	struct fd f_in;
1781	struct fd f_out;
1782	ssize_t ret = -EBADF;
1783
1784	f_in = fdget(fd_in);
1785	if (!f_in.file)
1786		goto out2;
1787
1788	f_out = fdget(fd_out);
1789	if (!f_out.file)
1790		goto out1;
1791
1792	ret = -EFAULT;
1793	if (off_in) {
1794		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1795			goto out;
1796	} else {
1797		pos_in = f_in.file->f_pos;
1798	}
1799
1800	if (off_out) {
1801		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1802			goto out;
1803	} else {
1804		pos_out = f_out.file->f_pos;
1805	}
1806
1807	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1808				  flags);
1809	if (ret > 0) {
1810		pos_in += ret;
1811		pos_out += ret;
1812
1813		if (off_in) {
1814			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1815				ret = -EFAULT;
1816		} else {
1817			f_in.file->f_pos = pos_in;
1818		}
1819
1820		if (off_out) {
1821			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1822				ret = -EFAULT;
1823		} else {
1824			f_out.file->f_pos = pos_out;
1825		}
1826	}
1827
1828out:
1829	fdput(f_out);
1830out1:
1831	fdput(f_in);
1832out2:
1833	return ret;
1834}
1835
1836static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
1837			     bool write)
1838{
1839	struct inode *inode = file_inode(file);
1840
1841	if (unlikely(pos < 0 || len < 0))
1842		return -EINVAL;
1843
1844	 if (unlikely((loff_t) (pos + len) < 0))
1845		return -EINVAL;
1846
1847	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1848		loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1849		int retval;
1850
1851		retval = locks_mandatory_area(inode, file, pos, end,
1852				write ? F_WRLCK : F_RDLCK);
1853		if (retval < 0)
1854			return retval;
1855	}
1856
1857	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1858}
1859/*
1860 * Ensure that we don't remap a partial EOF block in the middle of something
1861 * else.  Assume that the offsets have already been checked for block
1862 * alignment.
1863 *
1864 * For clone we only link a partial EOF block above or at the destination file's
1865 * EOF.  For deduplication we accept a partial EOF block only if it ends at the
1866 * destination file's EOF (can not link it into the middle of a file).
1867 *
1868 * Shorten the request if possible.
1869 */
1870static int generic_remap_check_len(struct inode *inode_in,
1871				   struct inode *inode_out,
1872				   loff_t pos_out,
1873				   loff_t *len,
1874				   unsigned int remap_flags)
1875{
1876	u64 blkmask = i_blocksize(inode_in) - 1;
1877	loff_t new_len = *len;
1878
1879	if ((*len & blkmask) == 0)
1880		return 0;
1881
1882	if (pos_out + *len < i_size_read(inode_out))
1883		new_len &= ~blkmask;
1884
1885	if (new_len == *len)
1886		return 0;
1887
1888	if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
1889		*len = new_len;
1890		return 0;
1891	}
1892
1893	return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
1894}
1895
1896/* Read a page's worth of file data into the page cache. */
1897static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1898{
1899	struct page *page;
1900
1901	page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
1902	if (IS_ERR(page))
1903		return page;
1904	if (!PageUptodate(page)) {
1905		put_page(page);
1906		return ERR_PTR(-EIO);
1907	}
1908	return page;
1909}
1910
1911/*
1912 * Lock two pages, ensuring that we lock in offset order if the pages are from
1913 * the same file.
1914 */
1915static void vfs_lock_two_pages(struct page *page1, struct page *page2)
1916{
1917	/* Always lock in order of increasing index. */
1918	if (page1->index > page2->index)
1919		swap(page1, page2);
1920
1921	lock_page(page1);
1922	if (page1 != page2)
1923		lock_page(page2);
1924}
1925
1926/* Unlock two pages, being careful not to unlock the same page twice. */
1927static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
1928{
1929	unlock_page(page1);
1930	if (page1 != page2)
1931		unlock_page(page2);
1932}
1933
1934/*
1935 * Compare extents of two files to see if they are the same.
1936 * Caller must have locked both inodes to prevent write races.
1937 */
1938static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1939					 struct inode *dest, loff_t destoff,
1940					 loff_t len, bool *is_same)
1941{
1942	loff_t src_poff;
1943	loff_t dest_poff;
1944	void *src_addr;
1945	void *dest_addr;
1946	struct page *src_page;
1947	struct page *dest_page;
1948	loff_t cmp_len;
1949	bool same;
1950	int error;
1951
1952	error = -EINVAL;
1953	same = true;
1954	while (len) {
1955		src_poff = srcoff & (PAGE_SIZE - 1);
1956		dest_poff = destoff & (PAGE_SIZE - 1);
1957		cmp_len = min(PAGE_SIZE - src_poff,
1958			      PAGE_SIZE - dest_poff);
1959		cmp_len = min(cmp_len, len);
1960		if (cmp_len <= 0)
1961			goto out_error;
1962
1963		src_page = vfs_dedupe_get_page(src, srcoff);
1964		if (IS_ERR(src_page)) {
1965			error = PTR_ERR(src_page);
1966			goto out_error;
1967		}
1968		dest_page = vfs_dedupe_get_page(dest, destoff);
1969		if (IS_ERR(dest_page)) {
1970			error = PTR_ERR(dest_page);
1971			put_page(src_page);
1972			goto out_error;
1973		}
1974
1975		vfs_lock_two_pages(src_page, dest_page);
1976
1977		/*
1978		 * Now that we've locked both pages, make sure they're still
1979		 * mapped to the file data we're interested in.  If not,
1980		 * someone is invalidating pages on us and we lose.
1981		 */
1982		if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
1983		    src_page->mapping != src->i_mapping ||
1984		    dest_page->mapping != dest->i_mapping) {
1985			same = false;
1986			goto unlock;
1987		}
1988
1989		src_addr = kmap_atomic(src_page);
1990		dest_addr = kmap_atomic(dest_page);
1991
1992		flush_dcache_page(src_page);
1993		flush_dcache_page(dest_page);
1994
1995		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1996			same = false;
1997
1998		kunmap_atomic(dest_addr);
1999		kunmap_atomic(src_addr);
2000unlock:
2001		vfs_unlock_two_pages(src_page, dest_page);
2002		put_page(dest_page);
2003		put_page(src_page);
2004
2005		if (!same)
2006			break;
2007
2008		srcoff += cmp_len;
2009		destoff += cmp_len;
2010		len -= cmp_len;
2011	}
2012
2013	*is_same = same;
2014	return 0;
2015
2016out_error:
2017	return error;
2018}
2019
2020/*
2021 * Check that the two inodes are eligible for cloning, the ranges make
2022 * sense, and then flush all dirty data.  Caller must ensure that the
2023 * inodes have been locked against any other modifications.
2024 *
2025 * If there's an error, then the usual negative error code is returned.
2026 * Otherwise returns 0 with *len set to the request length.
2027 */
2028int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
2029				  struct file *file_out, loff_t pos_out,
2030				  loff_t *len, unsigned int remap_flags)
2031{
2032	struct inode *inode_in = file_inode(file_in);
2033	struct inode *inode_out = file_inode(file_out);
2034	bool same_inode = (inode_in == inode_out);
2035	int ret;
2036
2037	/* Don't touch certain kinds of inodes */
2038	if (IS_IMMUTABLE(inode_out))
2039		return -EPERM;
2040
2041	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
2042		return -ETXTBSY;
2043
2044	/* Don't reflink dirs, pipes, sockets... */
2045	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
2046		return -EISDIR;
2047	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
2048		return -EINVAL;
2049
2050	/* Zero length dedupe exits immediately; reflink goes to EOF. */
2051	if (*len == 0) {
2052		loff_t isize = i_size_read(inode_in);
2053
2054		if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
2055			return 0;
2056		if (pos_in > isize)
2057			return -EINVAL;
2058		*len = isize - pos_in;
2059		if (*len == 0)
2060			return 0;
2061	}
2062
2063	/* Check that we don't violate system file offset limits. */
2064	ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
2065			remap_flags);
2066	if (ret)
2067		return ret;
2068
2069	/* Wait for the completion of any pending IOs on both files */
2070	inode_dio_wait(inode_in);
2071	if (!same_inode)
2072		inode_dio_wait(inode_out);
2073
2074	ret = filemap_write_and_wait_range(inode_in->i_mapping,
2075			pos_in, pos_in + *len - 1);
2076	if (ret)
2077		return ret;
2078
2079	ret = filemap_write_and_wait_range(inode_out->i_mapping,
2080			pos_out, pos_out + *len - 1);
2081	if (ret)
2082		return ret;
2083
2084	/*
2085	 * Check that the extents are the same.
2086	 */
2087	if (remap_flags & REMAP_FILE_DEDUP) {
2088		bool		is_same = false;
2089
2090		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
2091				inode_out, pos_out, *len, &is_same);
2092		if (ret)
2093			return ret;
2094		if (!is_same)
2095			return -EBADE;
2096	}
2097
2098	ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
2099			remap_flags);
2100	if (ret)
2101		return ret;
2102
2103	/* If can't alter the file contents, we're done. */
2104	if (!(remap_flags & REMAP_FILE_DEDUP))
2105		ret = file_modified(file_out);
2106
2107	return ret;
2108}
2109EXPORT_SYMBOL(generic_remap_file_range_prep);
2110
2111loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
2112			   struct file *file_out, loff_t pos_out,
2113			   loff_t len, unsigned int remap_flags)
2114{
2115	loff_t ret;
2116
2117	WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
2118
2119	/*
2120	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
2121	 * the same mount. Practically, they only need to be on the same file
2122	 * system.
2123	 */
2124	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
2125		return -EXDEV;
2126
2127	ret = generic_file_rw_checks(file_in, file_out);
2128	if (ret < 0)
2129		return ret;
2130
2131	if (!file_in->f_op->remap_file_range)
2132		return -EOPNOTSUPP;
2133
2134	ret = remap_verify_area(file_in, pos_in, len, false);
2135	if (ret)
2136		return ret;
2137
2138	ret = remap_verify_area(file_out, pos_out, len, true);
2139	if (ret)
2140		return ret;
2141
2142	ret = file_in->f_op->remap_file_range(file_in, pos_in,
2143			file_out, pos_out, len, remap_flags);
2144	if (ret < 0)
2145		return ret;
2146
2147	fsnotify_access(file_in);
2148	fsnotify_modify(file_out);
2149	return ret;
2150}
2151EXPORT_SYMBOL(do_clone_file_range);
2152
2153loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
2154			    struct file *file_out, loff_t pos_out,
2155			    loff_t len, unsigned int remap_flags)
2156{
2157	loff_t ret;
2158
2159	file_start_write(file_out);
2160	ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
2161				  remap_flags);
2162	file_end_write(file_out);
2163
2164	return ret;
2165}
2166EXPORT_SYMBOL(vfs_clone_file_range);
2167
2168/* Check whether we are allowed to dedupe the destination file */
2169static bool allow_file_dedupe(struct file *file)
2170{
2171	if (capable(CAP_SYS_ADMIN))
2172		return true;
2173	if (file->f_mode & FMODE_WRITE)
2174		return true;
2175	if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
2176		return true;
2177	if (!inode_permission(file_inode(file), MAY_WRITE))
2178		return true;
2179	return false;
2180}
2181
2182loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
2183				 struct file *dst_file, loff_t dst_pos,
2184				 loff_t len, unsigned int remap_flags)
2185{
2186	loff_t ret;
2187
2188	WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
2189				     REMAP_FILE_CAN_SHORTEN));
2190
2191	ret = mnt_want_write_file(dst_file);
2192	if (ret)
2193		return ret;
2194
2195	ret = remap_verify_area(dst_file, dst_pos, len, true);
2196	if (ret < 0)
2197		goto out_drop_write;
2198
2199	ret = -EPERM;
2200	if (!allow_file_dedupe(dst_file))
2201		goto out_drop_write;
2202
2203	ret = -EXDEV;
2204	if (src_file->f_path.mnt != dst_file->f_path.mnt)
2205		goto out_drop_write;
2206
2207	ret = -EISDIR;
2208	if (S_ISDIR(file_inode(dst_file)->i_mode))
2209		goto out_drop_write;
2210
2211	ret = -EINVAL;
2212	if (!dst_file->f_op->remap_file_range)
2213		goto out_drop_write;
2214
2215	if (len == 0) {
2216		ret = 0;
2217		goto out_drop_write;
2218	}
2219
2220	ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
2221			dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
2222out_drop_write:
2223	mnt_drop_write_file(dst_file);
2224
2225	return ret;
2226}
2227EXPORT_SYMBOL(vfs_dedupe_file_range_one);
2228
2229int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2230{
2231	struct file_dedupe_range_info *info;
2232	struct inode *src = file_inode(file);
2233	u64 off;
2234	u64 len;
2235	int i;
2236	int ret;
2237	u16 count = same->dest_count;
2238	loff_t deduped;
2239
2240	if (!(file->f_mode & FMODE_READ))
2241		return -EINVAL;
2242
2243	if (same->reserved1 || same->reserved2)
2244		return -EINVAL;
2245
2246	off = same->src_offset;
2247	len = same->src_length;
2248
2249	if (S_ISDIR(src->i_mode))
2250		return -EISDIR;
2251
2252	if (!S_ISREG(src->i_mode))
2253		return -EINVAL;
2254
2255	if (!file->f_op->remap_file_range)
2256		return -EOPNOTSUPP;
2257
2258	ret = remap_verify_area(file, off, len, false);
2259	if (ret < 0)
2260		return ret;
2261	ret = 0;
2262
2263	if (off + len > i_size_read(src))
2264		return -EINVAL;
2265
2266	/* Arbitrary 1G limit on a single dedupe request, can be raised. */
2267	len = min_t(u64, len, 1 << 30);
2268
2269	/* pre-format output fields to sane values */
2270	for (i = 0; i < count; i++) {
2271		same->info[i].bytes_deduped = 0ULL;
2272		same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2273	}
2274
2275	for (i = 0, info = same->info; i < count; i++, info++) {
2276		struct fd dst_fd = fdget(info->dest_fd);
2277		struct file *dst_file = dst_fd.file;
2278
2279		if (!dst_file) {
2280			info->status = -EBADF;
2281			goto next_loop;
2282		}
2283
2284		if (info->reserved) {
2285			info->status = -EINVAL;
2286			goto next_fdput;
2287		}
2288
2289		deduped = vfs_dedupe_file_range_one(file, off, dst_file,
2290						    info->dest_offset, len,
2291						    REMAP_FILE_CAN_SHORTEN);
2292		if (deduped == -EBADE)
2293			info->status = FILE_DEDUPE_RANGE_DIFFERS;
2294		else if (deduped < 0)
2295			info->status = deduped;
2296		else
2297			info->bytes_deduped = len;
2298
2299next_fdput:
2300		fdput(dst_fd);
2301next_loop:
2302		if (fatal_signal_pending(current))
2303			break;
2304	}
2305	return ret;
2306}
2307EXPORT_SYMBOL(vfs_dedupe_file_range);