Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vfs: add copy_file_range syscall and vfs helper

Add a copy_file_range() system call for offloading copies between
regular files.

This gives an interface to underlying layers of the storage stack which
can copy without reading and writing all the data. There are a few
candidates that should support copy offloading in the nearer term:

- btrfs shares extent references with its clone ioctl
- NFS has patches to add a COPY command which copies on the server
- SCSI has a family of XCOPY commands which copy in the device

This system call avoids the complexity of also accelerating the creation
of the destination file by operating on an existing destination file
descriptor, not a path.

Currently the high level vfs entry point limits copy offloading to files
on the same superblock (ranges within the same file are allowed). This
can be relaxed if we get implementations which can copy between file
systems safely.

Signed-off-by: Zach Brown <zab@redhat.com>
[Anna Schumaker: Change -EINVAL to -EBADF during file verification,
Change flags parameter from int to unsigned int,
Add function to include/linux/syscalls.h,
Check copy len after file open mode,
Don't forbid ranges inside the same file,
Use rw_verify_area() to verify ranges,
Use file_out rather than file_in,
Add COPY_FR_REFLINK flag]
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

authored by

Zach Brown and committed by
Al Viro
29732938 31ade3b8

+130 -1
+120
fs/read_write.c
··· 16 16 #include <linux/pagemap.h> 17 17 #include <linux/splice.h> 18 18 #include <linux/compat.h> 19 + #include <linux/mount.h> 19 20 #include "internal.h" 20 21 21 22 #include <asm/uaccess.h> ··· 1328 1327 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1329 1328 } 1330 1329 #endif 1330 + 1331 + /* 1332 + * copy_file_range() differs from regular file read and write in that it 1333 + * specifically allows return partial success. When it does so is up to 1334 + * the copy_file_range method. 1335 + */ 1336 + ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, 1337 + struct file *file_out, loff_t pos_out, 1338 + size_t len, unsigned int flags) 1339 + { 1340 + struct inode *inode_in = file_inode(file_in); 1341 + struct inode *inode_out = file_inode(file_out); 1342 + ssize_t ret; 1343 + 1344 + if (flags != 0) 1345 + return -EINVAL; 1346 + 1347 + /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */ 1348 + ret = rw_verify_area(READ, file_in, &pos_in, len); 1349 + if (ret >= 0) 1350 + ret = rw_verify_area(WRITE, file_out, &pos_out, len); 1351 + if (ret < 0) 1352 + return ret; 1353 + 1354 + if (!(file_in->f_mode & FMODE_READ) || 1355 + !(file_out->f_mode & FMODE_WRITE) || 1356 + (file_out->f_flags & O_APPEND) || 1357 + !file_out->f_op->copy_file_range) 1358 + return -EBADF; 1359 + 1360 + /* this could be relaxed once a method supports cross-fs copies */ 1361 + if (inode_in->i_sb != inode_out->i_sb) 1362 + return -EXDEV; 1363 + 1364 + if (len == 0) 1365 + return 0; 1366 + 1367 + ret = mnt_want_write_file(file_out); 1368 + if (ret) 1369 + return ret; 1370 + 1371 + ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, 1372 + len, flags); 1373 + if (ret > 0) { 1374 + fsnotify_access(file_in); 1375 + add_rchar(current, ret); 1376 + fsnotify_modify(file_out); 1377 + add_wchar(current, ret); 1378 + } 1379 + inc_syscr(current); 1380 + inc_syscw(current); 1381 + 1382 + mnt_drop_write_file(file_out); 1383 + 1384 + return ret; 
1385 + } 1386 + EXPORT_SYMBOL(vfs_copy_file_range); 1387 + 1388 + SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, 1389 + int, fd_out, loff_t __user *, off_out, 1390 + size_t, len, unsigned int, flags) 1391 + { 1392 + loff_t pos_in; 1393 + loff_t pos_out; 1394 + struct fd f_in; 1395 + struct fd f_out; 1396 + ssize_t ret = -EBADF; 1397 + 1398 + f_in = fdget(fd_in); 1399 + if (!f_in.file) 1400 + goto out2; 1401 + 1402 + f_out = fdget(fd_out); 1403 + if (!f_out.file) 1404 + goto out1; 1405 + 1406 + ret = -EFAULT; 1407 + if (off_in) { 1408 + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) 1409 + goto out; 1410 + } else { 1411 + pos_in = f_in.file->f_pos; 1412 + } 1413 + 1414 + if (off_out) { 1415 + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) 1416 + goto out; 1417 + } else { 1418 + pos_out = f_out.file->f_pos; 1419 + } 1420 + 1421 + ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, 1422 + flags); 1423 + if (ret > 0) { 1424 + pos_in += ret; 1425 + pos_out += ret; 1426 + 1427 + if (off_in) { 1428 + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) 1429 + ret = -EFAULT; 1430 + } else { 1431 + f_in.file->f_pos = pos_in; 1432 + } 1433 + 1434 + if (off_out) { 1435 + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) 1436 + ret = -EFAULT; 1437 + } else { 1438 + f_out.file->f_pos = pos_out; 1439 + } 1440 + } 1441 + 1442 + out: 1443 + fdput(f_out); 1444 + out1: 1445 + fdput(f_in); 1446 + out2: 1447 + return ret; 1448 + }
+3
include/linux/fs.h
··· 1629 1629 #ifndef CONFIG_MMU 1630 1630 unsigned (*mmap_capabilities)(struct file *); 1631 1631 #endif 1632 + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); 1632 1633 }; 1633 1634 1634 1635 struct inode_operations { ··· 1681 1680 unsigned long, loff_t *); 1682 1681 extern ssize_t vfs_writev(struct file *, const struct iovec __user *, 1683 1682 unsigned long, loff_t *); 1683 + extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, 1684 + loff_t, size_t, unsigned int); 1684 1685 1685 1686 struct super_operations { 1686 1687 struct inode *(*alloc_inode)(struct super_block *sb);
+3
include/linux/syscalls.h
··· 886 886 const char __user *const __user *envp, int flags); 887 887 888 888 asmlinkage long sys_membarrier(int cmd, int flags); 889 + asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in, 890 + int fd_out, loff_t __user *off_out, 891 + size_t len, unsigned int flags); 889 892 890 893 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); 891 894
+3 -1
include/uapi/asm-generic/unistd.h
··· 715 715 __SYSCALL(__NR_membarrier, sys_membarrier) 716 716 #define __NR_mlock2 284 717 717 __SYSCALL(__NR_mlock2, sys_mlock2) 718 + #define __NR_copy_file_range 285 719 + __SYSCALL(__NR_copy_file_range, sys_copy_file_range) 718 720 719 721 #undef __NR_syscalls 720 - #define __NR_syscalls 285 722 + #define __NR_syscalls 286 721 723 722 724 /* 723 725 * All syscalls below here should go away really,
+1
kernel/sys_ni.c
··· 174 174 cond_syscall(sys_setfsgid); 175 175 cond_syscall(sys_capget); 176 176 cond_syscall(sys_capset); 177 + cond_syscall(sys_copy_file_range); 177 178 178 179 /* arch-specific weak syscall entries */ 179 180 cond_syscall(sys_pciconfig_read);