Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfs-6.8.rw' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs rw updates from Christian Brauner:
"This contains updates from Amir for read-write backing file helpers
for stacking filesystems such as overlayfs:

- Fanotify is currently in the process of introducing pre content
events. Roughly, a new permission event will be added indicating
that it is safe to write to the file being accessed. These events
are used by hierarchical storage managers to e.g., fill the content
of files on first access.

During that work we noticed that our current permission checking is
inconsistent in rw_verify_area() and remap_verify_area().
Especially in the splice code permission checking is done multiple
times. For example, one time for the whole range and then again for
partial ranges inside the iterator.

In addition, we mostly do permission checking before we call
file_start_write() except for a few places where we call it after.
For pre-content events we need such permission checking to be done
before file_start_write(). So this is a nice reason to clean this
all up.

After this series, all permission checking is done before
file_start_write().

As part of this cleanup we also massaged the splice code a bit. We
got rid of a few helpers because we are already drowning in special
read-write helpers. We also cleaned up the return types for splice
helpers.

- Introduce generic read-write helpers for backing files. This lifts
some overlayfs code to common code so it can be used by the FUSE
passthrough work coming in over the next cycles. Make Amir and
Miklos the maintainers for this new subsystem of the vfs"

* tag 'vfs-6.8.rw' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (30 commits)
fs: fix __sb_write_started() kerneldoc formatting
fs: factor out backing_file_mmap() helper
fs: factor out backing_file_splice_{read,write}() helpers
fs: factor out backing_file_{read,write}_iter() helpers
fs: prepare for stackable filesystems backing file helpers
fsnotify: optionally pass access range in file permission hooks
fsnotify: assert that file_start_write() is not held in permission hooks
fsnotify: split fsnotify_perm() into two hooks
fs: use splice_copy_file_range() inline helper
splice: return type ssize_t from all helpers
fs: use do_splice_direct() for nfsd/ksmbd server-side-copy
fs: move file_start_write() into direct_splice_actor()
fs: fork splice_file_range() from do_splice_direct()
fs: create {sb,file}_write_not_started() helpers
fs: create file_write_started() helper
fs: create __sb_write_started() helper
fs: move kiocb_start_write() into vfs_iocb_iter_write()
fs: move permission hook out of do_iter_read()
fs: move permission hook out of do_iter_write()
fs: move file_start_write() into vfs_iter_write()
...

+956 -582
+9
MAINTAINERS
··· 8143 8143 F: fs/iomap/ 8144 8144 F: include/linux/iomap.h 8145 8145 8146 + FILESYSTEMS [STACKABLE] 8147 + M: Miklos Szeredi <miklos@szeredi.hu> 8148 + M: Amir Goldstein <amir73il@gmail.com> 8149 + L: linux-fsdevel@vger.kernel.org 8150 + L: linux-unionfs@vger.kernel.org 8151 + S: Maintained 8152 + F: fs/backing-file.c 8153 + F: include/linux/backing-file.h 8154 + 8146 8155 FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER 8147 8156 M: Riku Voipio <riku.voipio@iki.fi> 8148 8157 L: linux-hwmon@vger.kernel.org
-2
drivers/block/loop.c
··· 245 245 246 246 iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len); 247 247 248 - file_start_write(file); 249 248 bw = vfs_iter_write(file, &i, ppos, 0); 250 - file_end_write(file); 251 249 252 250 if (likely(bw == bvec->bv_len)) 253 251 return 0;
+4
fs/Kconfig
··· 18 18 config FS_IOMAP 19 19 bool 20 20 21 + # Stackable filesystems 22 + config FS_STACK 23 + bool 24 + 21 25 config BUFFER_HEAD 22 26 bool 23 27
+1
fs/Makefile
··· 39 39 obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o 40 40 obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o 41 41 42 + obj-$(CONFIG_FS_STACK) += backing-file.o 42 43 obj-$(CONFIG_FS_MBCACHE) += mbcache.o 43 44 obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o 44 45 obj-$(CONFIG_NFS_COMMON) += nfs_common/
+336
fs/backing-file.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Common helpers for stackable filesystems and backing files. 4 + * 5 + * Forked from fs/overlayfs/file.c. 6 + * 7 + * Copyright (C) 2017 Red Hat, Inc. 8 + * Copyright (C) 2023 CTERA Networks. 9 + */ 10 + 11 + #include <linux/fs.h> 12 + #include <linux/backing-file.h> 13 + #include <linux/splice.h> 14 + #include <linux/mm.h> 15 + 16 + #include "internal.h" 17 + 18 + /** 19 + * backing_file_open - open a backing file for kernel internal use 20 + * @user_path: path that the user reuqested to open 21 + * @flags: open flags 22 + * @real_path: path of the backing file 23 + * @cred: credentials for open 24 + * 25 + * Open a backing file for a stackable filesystem (e.g., overlayfs). 26 + * @user_path may be on the stackable filesystem and @real_path on the 27 + * underlying filesystem. In this case, we want to be able to return the 28 + * @user_path of the stackable filesystem. This is done by embedding the 29 + * returned file into a container structure that also stores the stacked 30 + * file's path, which can be retrieved using backing_file_user_path(). 
31 + */ 32 + struct file *backing_file_open(const struct path *user_path, int flags, 33 + const struct path *real_path, 34 + const struct cred *cred) 35 + { 36 + struct file *f; 37 + int error; 38 + 39 + f = alloc_empty_backing_file(flags, cred); 40 + if (IS_ERR(f)) 41 + return f; 42 + 43 + path_get(user_path); 44 + *backing_file_user_path(f) = *user_path; 45 + error = vfs_open(real_path, f); 46 + if (error) { 47 + fput(f); 48 + f = ERR_PTR(error); 49 + } 50 + 51 + return f; 52 + } 53 + EXPORT_SYMBOL_GPL(backing_file_open); 54 + 55 + struct backing_aio { 56 + struct kiocb iocb; 57 + refcount_t ref; 58 + struct kiocb *orig_iocb; 59 + /* used for aio completion */ 60 + void (*end_write)(struct file *); 61 + struct work_struct work; 62 + long res; 63 + }; 64 + 65 + static struct kmem_cache *backing_aio_cachep; 66 + 67 + #define BACKING_IOCB_MASK \ 68 + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) 69 + 70 + static rwf_t iocb_to_rw_flags(int flags) 71 + { 72 + return (__force rwf_t)(flags & BACKING_IOCB_MASK); 73 + } 74 + 75 + static void backing_aio_put(struct backing_aio *aio) 76 + { 77 + if (refcount_dec_and_test(&aio->ref)) { 78 + fput(aio->iocb.ki_filp); 79 + kmem_cache_free(backing_aio_cachep, aio); 80 + } 81 + } 82 + 83 + static void backing_aio_cleanup(struct backing_aio *aio, long res) 84 + { 85 + struct kiocb *iocb = &aio->iocb; 86 + struct kiocb *orig_iocb = aio->orig_iocb; 87 + 88 + if (aio->end_write) 89 + aio->end_write(orig_iocb->ki_filp); 90 + 91 + orig_iocb->ki_pos = iocb->ki_pos; 92 + backing_aio_put(aio); 93 + } 94 + 95 + static void backing_aio_rw_complete(struct kiocb *iocb, long res) 96 + { 97 + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); 98 + struct kiocb *orig_iocb = aio->orig_iocb; 99 + 100 + if (iocb->ki_flags & IOCB_WRITE) 101 + kiocb_end_write(iocb); 102 + 103 + backing_aio_cleanup(aio, res); 104 + orig_iocb->ki_complete(orig_iocb, res); 105 + } 106 + 107 + static void 
backing_aio_complete_work(struct work_struct *work) 108 + { 109 + struct backing_aio *aio = container_of(work, struct backing_aio, work); 110 + 111 + backing_aio_rw_complete(&aio->iocb, aio->res); 112 + } 113 + 114 + static void backing_aio_queue_completion(struct kiocb *iocb, long res) 115 + { 116 + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); 117 + 118 + /* 119 + * Punt to a work queue to serialize updates of mtime/size. 120 + */ 121 + aio->res = res; 122 + INIT_WORK(&aio->work, backing_aio_complete_work); 123 + queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, 124 + &aio->work); 125 + } 126 + 127 + static int backing_aio_init_wq(struct kiocb *iocb) 128 + { 129 + struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; 130 + 131 + if (sb->s_dio_done_wq) 132 + return 0; 133 + 134 + return sb_init_dio_done_wq(sb); 135 + } 136 + 137 + 138 + ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, 139 + struct kiocb *iocb, int flags, 140 + struct backing_file_ctx *ctx) 141 + { 142 + struct backing_aio *aio = NULL; 143 + const struct cred *old_cred; 144 + ssize_t ret; 145 + 146 + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) 147 + return -EIO; 148 + 149 + if (!iov_iter_count(iter)) 150 + return 0; 151 + 152 + if (iocb->ki_flags & IOCB_DIRECT && 153 + !(file->f_mode & FMODE_CAN_ODIRECT)) 154 + return -EINVAL; 155 + 156 + old_cred = override_creds(ctx->cred); 157 + if (is_sync_kiocb(iocb)) { 158 + rwf_t rwf = iocb_to_rw_flags(flags); 159 + 160 + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); 161 + } else { 162 + ret = -ENOMEM; 163 + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); 164 + if (!aio) 165 + goto out; 166 + 167 + aio->orig_iocb = iocb; 168 + kiocb_clone(&aio->iocb, iocb, get_file(file)); 169 + aio->iocb.ki_complete = backing_aio_rw_complete; 170 + refcount_set(&aio->ref, 2); 171 + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); 172 + backing_aio_put(aio); 173 + if (ret != 
-EIOCBQUEUED) 174 + backing_aio_cleanup(aio, ret); 175 + } 176 + out: 177 + revert_creds(old_cred); 178 + 179 + if (ctx->accessed) 180 + ctx->accessed(ctx->user_file); 181 + 182 + return ret; 183 + } 184 + EXPORT_SYMBOL_GPL(backing_file_read_iter); 185 + 186 + ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, 187 + struct kiocb *iocb, int flags, 188 + struct backing_file_ctx *ctx) 189 + { 190 + const struct cred *old_cred; 191 + ssize_t ret; 192 + 193 + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) 194 + return -EIO; 195 + 196 + if (!iov_iter_count(iter)) 197 + return 0; 198 + 199 + ret = file_remove_privs(ctx->user_file); 200 + if (ret) 201 + return ret; 202 + 203 + if (iocb->ki_flags & IOCB_DIRECT && 204 + !(file->f_mode & FMODE_CAN_ODIRECT)) 205 + return -EINVAL; 206 + 207 + /* 208 + * Stacked filesystems don't support deferred completions, don't copy 209 + * this property in case it is set by the issuer. 210 + */ 211 + flags &= ~IOCB_DIO_CALLER_COMP; 212 + 213 + old_cred = override_creds(ctx->cred); 214 + if (is_sync_kiocb(iocb)) { 215 + rwf_t rwf = iocb_to_rw_flags(flags); 216 + 217 + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); 218 + if (ctx->end_write) 219 + ctx->end_write(ctx->user_file); 220 + } else { 221 + struct backing_aio *aio; 222 + 223 + ret = backing_aio_init_wq(iocb); 224 + if (ret) 225 + goto out; 226 + 227 + ret = -ENOMEM; 228 + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); 229 + if (!aio) 230 + goto out; 231 + 232 + aio->orig_iocb = iocb; 233 + aio->end_write = ctx->end_write; 234 + kiocb_clone(&aio->iocb, iocb, get_file(file)); 235 + aio->iocb.ki_flags = flags; 236 + aio->iocb.ki_complete = backing_aio_queue_completion; 237 + refcount_set(&aio->ref, 2); 238 + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); 239 + backing_aio_put(aio); 240 + if (ret != -EIOCBQUEUED) 241 + backing_aio_cleanup(aio, ret); 242 + } 243 + out: 244 + revert_creds(old_cred); 245 + 246 + return ret; 247 + } 248 + 
EXPORT_SYMBOL_GPL(backing_file_write_iter); 249 + 250 + ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, 251 + struct pipe_inode_info *pipe, size_t len, 252 + unsigned int flags, 253 + struct backing_file_ctx *ctx) 254 + { 255 + const struct cred *old_cred; 256 + ssize_t ret; 257 + 258 + if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) 259 + return -EIO; 260 + 261 + old_cred = override_creds(ctx->cred); 262 + ret = vfs_splice_read(in, ppos, pipe, len, flags); 263 + revert_creds(old_cred); 264 + 265 + if (ctx->accessed) 266 + ctx->accessed(ctx->user_file); 267 + 268 + return ret; 269 + } 270 + EXPORT_SYMBOL_GPL(backing_file_splice_read); 271 + 272 + ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, 273 + struct file *out, loff_t *ppos, size_t len, 274 + unsigned int flags, 275 + struct backing_file_ctx *ctx) 276 + { 277 + const struct cred *old_cred; 278 + ssize_t ret; 279 + 280 + if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) 281 + return -EIO; 282 + 283 + ret = file_remove_privs(ctx->user_file); 284 + if (ret) 285 + return ret; 286 + 287 + old_cred = override_creds(ctx->cred); 288 + file_start_write(out); 289 + ret = iter_file_splice_write(pipe, out, ppos, len, flags); 290 + file_end_write(out); 291 + revert_creds(old_cred); 292 + 293 + if (ctx->end_write) 294 + ctx->end_write(ctx->user_file); 295 + 296 + return ret; 297 + } 298 + EXPORT_SYMBOL_GPL(backing_file_splice_write); 299 + 300 + int backing_file_mmap(struct file *file, struct vm_area_struct *vma, 301 + struct backing_file_ctx *ctx) 302 + { 303 + const struct cred *old_cred; 304 + int ret; 305 + 306 + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || 307 + WARN_ON_ONCE(ctx->user_file != vma->vm_file)) 308 + return -EIO; 309 + 310 + if (!file->f_op->mmap) 311 + return -ENODEV; 312 + 313 + vma_set_file(vma, file); 314 + 315 + old_cred = override_creds(ctx->cred); 316 + ret = call_mmap(vma->vm_file, vma); 317 + revert_creds(old_cred); 318 + 319 + if (ctx->accessed) 320 + 
ctx->accessed(ctx->user_file); 321 + 322 + return ret; 323 + } 324 + EXPORT_SYMBOL_GPL(backing_file_mmap); 325 + 326 + static int __init backing_aio_init(void) 327 + { 328 + backing_aio_cachep = kmem_cache_create("backing_aio", 329 + sizeof(struct backing_aio), 330 + 0, SLAB_HWCACHE_ALIGN, NULL); 331 + if (!backing_aio_cachep) 332 + return -ENOMEM; 333 + 334 + return 0; 335 + } 336 + fs_initcall(backing_aio_init);
+6 -6
fs/btrfs/ioctl.c
··· 4533 4533 if (ret < 0) 4534 4534 goto out_acct; 4535 4535 4536 - file_start_write(file); 4537 - 4538 4536 if (iov_iter_count(&iter) == 0) { 4539 4537 ret = 0; 4540 - goto out_end_write; 4538 + goto out_iov; 4541 4539 } 4542 4540 pos = args.offset; 4543 4541 ret = rw_verify_area(WRITE, file, &pos, args.len); 4544 4542 if (ret < 0) 4545 - goto out_end_write; 4543 + goto out_iov; 4546 4544 4547 4545 init_sync_kiocb(&kiocb, file); 4548 4546 ret = kiocb_set_rw_flags(&kiocb, 0); 4549 4547 if (ret) 4550 - goto out_end_write; 4548 + goto out_iov; 4551 4549 kiocb.ki_pos = pos; 4550 + 4551 + file_start_write(file); 4552 4552 4553 4553 ret = btrfs_do_write_iter(&kiocb, &iter, &args); 4554 4554 if (ret > 0) 4555 4555 fsnotify_modify(file); 4556 4556 4557 - out_end_write: 4558 4557 file_end_write(file); 4558 + out_iov: 4559 4559 kfree(iov); 4560 4560 out_acct: 4561 4561 if (ret > 0)
+2 -3
fs/cachefiles/io.c
··· 259 259 260 260 _enter("%ld", ret); 261 261 262 - kiocb_end_write(iocb); 262 + if (ki->was_async) 263 + kiocb_end_write(iocb); 263 264 264 265 if (ret < 0) 265 266 trace_cachefiles_io_error(object, inode, ret, ··· 319 318 if (ki->term_func) 320 319 ki->iocb.ki_complete = cachefiles_write_complete; 321 320 atomic_long_add(ki->b_writing, &cache->b_writing); 322 - 323 - kiocb_start_write(&ki->iocb); 324 321 325 322 get_file(ki->iocb.ki_filp); 326 323 cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
+7 -6
fs/ceph/file.c
··· 12 12 #include <linux/falloc.h> 13 13 #include <linux/iversion.h> 14 14 #include <linux/ktime.h> 15 + #include <linux/splice.h> 15 16 16 17 #include "super.h" 17 18 #include "mds_client.h" ··· 3011 3010 * {read,write}_iter, which will get caps again. 3012 3011 */ 3013 3012 put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); 3014 - ret = do_splice_direct(src_file, &src_off, dst_file, 3015 - &dst_off, src_objlen, flags); 3013 + ret = splice_file_range(src_file, &src_off, dst_file, &dst_off, 3014 + src_objlen); 3016 3015 /* Abort on short copies or on error */ 3017 3016 if (ret < (long)src_objlen) { 3018 3017 doutc(cl, "Failed partial copy (%zd)\n", ret); ··· 3066 3065 */ 3067 3066 if (len && (len < src_ci->i_layout.object_size)) { 3068 3067 doutc(cl, "Final partial copy of %zu bytes\n", len); 3069 - bytes = do_splice_direct(src_file, &src_off, dst_file, 3070 - &dst_off, len, flags); 3068 + bytes = splice_file_range(src_file, &src_off, dst_file, 3069 + &dst_off, len); 3071 3070 if (bytes > 0) 3072 3071 ret += bytes; 3073 3072 else ··· 3090 3089 len, flags); 3091 3090 3092 3091 if (ret == -EOPNOTSUPP || ret == -EXDEV) 3093 - ret = generic_copy_file_range(src_file, src_off, dst_file, 3094 - dst_off, len, flags); 3092 + ret = splice_copy_file_range(src_file, src_off, dst_file, 3093 + dst_off, len); 3095 3094 return ret; 3096 3095 } 3097 3096
-2
fs/coda/file.c
··· 79 79 if (ret) 80 80 goto finish_write; 81 81 82 - file_start_write(host_file); 83 82 inode_lock(coda_inode); 84 83 ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); 85 84 coda_inode->i_size = file_inode(host_file)->i_size; 86 85 coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; 87 86 inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode)); 88 87 inode_unlock(coda_inode); 89 - file_end_write(host_file); 90 88 91 89 finish_write: 92 90 venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
+3 -2
fs/fuse/file.c
··· 19 19 #include <linux/uio.h> 20 20 #include <linux/fs.h> 21 21 #include <linux/filelock.h> 22 + #include <linux/splice.h> 22 23 23 24 static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, 24 25 unsigned int open_flags, int opcode, ··· 3196 3195 len, flags); 3197 3196 3198 3197 if (ret == -EOPNOTSUPP || ret == -EXDEV) 3199 - ret = generic_copy_file_range(src_file, src_off, dst_file, 3200 - dst_off, len, flags); 3198 + ret = splice_copy_file_range(src_file, src_off, dst_file, 3199 + dst_off, len); 3201 3200 return ret; 3202 3201 } 3203 3202
+4 -4
fs/internal.h
··· 244 244 /* 245 245 * fs/splice.c: 246 246 */ 247 - long splice_file_to_pipe(struct file *in, 248 - struct pipe_inode_info *opipe, 249 - loff_t *offset, 250 - size_t len, unsigned int flags); 247 + ssize_t splice_file_to_pipe(struct file *in, 248 + struct pipe_inode_info *opipe, 249 + loff_t *offset, 250 + size_t len, unsigned int flags); 251 251 252 252 /* 253 253 * fs/xattr.c:
+3 -2
fs/nfs/nfs4file.c
··· 10 10 #include <linux/mount.h> 11 11 #include <linux/nfs_fs.h> 12 12 #include <linux/nfs_ssc.h> 13 + #include <linux/splice.h> 13 14 #include "delegation.h" 14 15 #include "internal.h" 15 16 #include "iostat.h" ··· 196 195 ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count, 197 196 flags); 198 197 if (ret == -EOPNOTSUPP || ret == -EXDEV) 199 - ret = generic_copy_file_range(file_in, pos_in, file_out, 200 - pos_out, count, flags); 198 + ret = splice_copy_file_range(file_in, pos_in, file_out, 199 + pos_out, count); 201 200 return ret; 202 201 } 203 202
+4 -3
fs/nfsd/vfs.c
··· 1039 1039 ssize_t host_err; 1040 1040 1041 1041 trace_nfsd_read_splice(rqstp, fhp, offset, *count); 1042 - host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); 1042 + host_err = rw_verify_area(READ, file, &offset, *count); 1043 + if (!host_err) 1044 + host_err = splice_direct_to_actor(file, &sd, 1045 + nfsd_direct_splice_actor); 1043 1046 return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); 1044 1047 } 1045 1048 ··· 1179 1176 since = READ_ONCE(file->f_wb_err); 1180 1177 if (verf) 1181 1178 nfsd_copy_write_verifier(verf, nn); 1182 - file_start_write(file); 1183 1179 host_err = vfs_iter_write(file, &iter, &pos, flags); 1184 - file_end_write(file); 1185 1180 if (host_err < 0) { 1186 1181 commit_reset_write_verifier(nn, rqstp, host_err); 1187 1182 goto out_nfserr;
+4 -38
fs/open.c
··· 304 304 if (ret) 305 305 return ret; 306 306 307 + ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); 308 + if (ret) 309 + return ret; 310 + 307 311 if (S_ISFIFO(inode->i_mode)) 308 312 return -ESPIPE; 309 313 ··· 1181 1177 return f; 1182 1178 } 1183 1179 EXPORT_SYMBOL_GPL(kernel_file_open); 1184 - 1185 - /** 1186 - * backing_file_open - open a backing file for kernel internal use 1187 - * @user_path: path that the user reuqested to open 1188 - * @flags: open flags 1189 - * @real_path: path of the backing file 1190 - * @cred: credentials for open 1191 - * 1192 - * Open a backing file for a stackable filesystem (e.g., overlayfs). 1193 - * @user_path may be on the stackable filesystem and @real_path on the 1194 - * underlying filesystem. In this case, we want to be able to return the 1195 - * @user_path of the stackable filesystem. This is done by embedding the 1196 - * returned file into a container structure that also stores the stacked 1197 - * file's path, which can be retrieved using backing_file_user_path(). 1198 - */ 1199 - struct file *backing_file_open(const struct path *user_path, int flags, 1200 - const struct path *real_path, 1201 - const struct cred *cred) 1202 - { 1203 - struct file *f; 1204 - int error; 1205 - 1206 - f = alloc_empty_backing_file(flags, cred); 1207 - if (IS_ERR(f)) 1208 - return f; 1209 - 1210 - path_get(user_path); 1211 - *backing_file_user_path(f) = *user_path; 1212 - f->f_path = *real_path; 1213 - error = do_dentry_open(f, d_inode(real_path->dentry), NULL); 1214 - if (error) { 1215 - fput(f); 1216 - f = ERR_PTR(error); 1217 - } 1218 - 1219 - return f; 1220 - } 1221 - EXPORT_SYMBOL_GPL(backing_file_open); 1222 1180 1223 1181 #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) 1224 1182 #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
+1
fs/overlayfs/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config OVERLAY_FS 3 3 tristate "Overlay filesystem support" 4 + select FS_STACK 4 5 select EXPORTFS 5 6 help 6 7 An overlay filesystem combines two filesystems - an 'upper' filesystem
+26 -4
fs/overlayfs/copy_up.c
··· 230 230 return ovl_real_fileattr_set(new, &newfa); 231 231 } 232 232 233 + static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) 234 + { 235 + loff_t tmp; 236 + 237 + if (WARN_ON_ONCE(pos != pos2)) 238 + return -EIO; 239 + if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0)) 240 + return -EIO; 241 + if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp))) 242 + return -EIO; 243 + return 0; 244 + } 245 + 233 246 static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, 234 247 struct file *new_file, loff_t len) 235 248 { ··· 257 244 int error = 0; 258 245 259 246 ovl_path_lowerdata(dentry, &datapath); 260 - if (WARN_ON(datapath.dentry == NULL)) 247 + if (WARN_ON_ONCE(datapath.dentry == NULL) || 248 + WARN_ON_ONCE(len < 0)) 261 249 return -EIO; 262 250 263 251 old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY); 264 252 if (IS_ERR(old_file)) 265 253 return PTR_ERR(old_file); 254 + 255 + error = rw_verify_area(READ, old_file, &old_pos, len); 256 + if (!error) 257 + error = rw_verify_area(WRITE, new_file, &new_pos, len); 258 + if (error) 259 + goto out_fput; 266 260 267 261 /* Try to use clone_file_range to clone up within the same fs */ 268 262 ovl_start_write(dentry); ··· 285 265 286 266 while (len) { 287 267 size_t this_len = OVL_COPY_UP_CHUNK_SIZE; 288 - long bytes; 268 + ssize_t bytes; 289 269 290 270 if (len < this_len) 291 271 this_len = len; ··· 329 309 } 330 310 } 331 311 332 - ovl_start_write(dentry); 312 + error = ovl_verify_area(old_pos, new_pos, this_len, len); 313 + if (error) 314 + break; 315 + 333 316 bytes = do_splice_direct(old_file, &old_pos, 334 317 new_file, &new_pos, 335 318 this_len, SPLICE_F_MOVE); 336 - ovl_end_write(dentry); 337 319 if (bytes <= 0) { 338 320 error = bytes; 339 321 break;
+33 -214
fs/overlayfs/file.c
··· 9 9 #include <linux/xattr.h> 10 10 #include <linux/uio.h> 11 11 #include <linux/uaccess.h> 12 - #include <linux/splice.h> 13 12 #include <linux/security.h> 14 - #include <linux/mm.h> 15 13 #include <linux/fs.h> 14 + #include <linux/backing-file.h> 16 15 #include "overlayfs.h" 17 - 18 - #include "../internal.h" /* for sb_init_dio_done_wq */ 19 - 20 - struct ovl_aio_req { 21 - struct kiocb iocb; 22 - refcount_t ref; 23 - struct kiocb *orig_iocb; 24 - /* used for aio completion */ 25 - struct work_struct work; 26 - long res; 27 - }; 28 - 29 - static struct kmem_cache *ovl_aio_request_cachep; 30 16 31 17 static char ovl_whatisit(struct inode *inode, struct inode *realinode) 32 18 { ··· 260 274 touch_atime(&file->f_path); 261 275 } 262 276 263 - #define OVL_IOCB_MASK \ 264 - (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) 265 - 266 - static rwf_t iocb_to_rw_flags(int flags) 267 - { 268 - return (__force rwf_t)(flags & OVL_IOCB_MASK); 269 - } 270 - 271 - static inline void ovl_aio_put(struct ovl_aio_req *aio_req) 272 - { 273 - if (refcount_dec_and_test(&aio_req->ref)) { 274 - fput(aio_req->iocb.ki_filp); 275 - kmem_cache_free(ovl_aio_request_cachep, aio_req); 276 - } 277 - } 278 - 279 - static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) 280 - { 281 - struct kiocb *iocb = &aio_req->iocb; 282 - struct kiocb *orig_iocb = aio_req->orig_iocb; 283 - 284 - if (iocb->ki_flags & IOCB_WRITE) { 285 - kiocb_end_write(iocb); 286 - ovl_file_modified(orig_iocb->ki_filp); 287 - } 288 - 289 - orig_iocb->ki_pos = iocb->ki_pos; 290 - ovl_aio_put(aio_req); 291 - } 292 - 293 - static void ovl_aio_rw_complete(struct kiocb *iocb, long res) 294 - { 295 - struct ovl_aio_req *aio_req = container_of(iocb, 296 - struct ovl_aio_req, iocb); 297 - struct kiocb *orig_iocb = aio_req->orig_iocb; 298 - 299 - ovl_aio_cleanup_handler(aio_req); 300 - orig_iocb->ki_complete(orig_iocb, res); 301 - } 302 - 303 - static void ovl_aio_complete_work(struct work_struct *work) 304 
- { 305 - struct ovl_aio_req *aio_req = container_of(work, 306 - struct ovl_aio_req, work); 307 - 308 - ovl_aio_rw_complete(&aio_req->iocb, aio_req->res); 309 - } 310 - 311 - static void ovl_aio_queue_completion(struct kiocb *iocb, long res) 312 - { 313 - struct ovl_aio_req *aio_req = container_of(iocb, 314 - struct ovl_aio_req, iocb); 315 - struct kiocb *orig_iocb = aio_req->orig_iocb; 316 - 317 - /* 318 - * Punt to a work queue to serialize updates of mtime/size. 319 - */ 320 - aio_req->res = res; 321 - INIT_WORK(&aio_req->work, ovl_aio_complete_work); 322 - queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq, 323 - &aio_req->work); 324 - } 325 - 326 - static int ovl_init_aio_done_wq(struct super_block *sb) 327 - { 328 - if (sb->s_dio_done_wq) 329 - return 0; 330 - 331 - return sb_init_dio_done_wq(sb); 332 - } 333 - 334 277 static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) 335 278 { 336 279 struct file *file = iocb->ki_filp; 337 280 struct fd real; 338 - const struct cred *old_cred; 339 281 ssize_t ret; 282 + struct backing_file_ctx ctx = { 283 + .cred = ovl_creds(file_inode(file)->i_sb), 284 + .user_file = file, 285 + .accessed = ovl_file_accessed, 286 + }; 340 287 341 288 if (!iov_iter_count(iter)) 342 289 return 0; ··· 278 359 if (ret) 279 360 return ret; 280 361 281 - ret = -EINVAL; 282 - if (iocb->ki_flags & IOCB_DIRECT && 283 - !(real.file->f_mode & FMODE_CAN_ODIRECT)) 284 - goto out_fdput; 285 - 286 - old_cred = ovl_override_creds(file_inode(file)->i_sb); 287 - if (is_sync_kiocb(iocb)) { 288 - rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags); 289 - 290 - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf); 291 - } else { 292 - struct ovl_aio_req *aio_req; 293 - 294 - ret = -ENOMEM; 295 - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); 296 - if (!aio_req) 297 - goto out; 298 - 299 - aio_req->orig_iocb = iocb; 300 - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); 301 - aio_req->iocb.ki_complete = 
ovl_aio_rw_complete; 302 - refcount_set(&aio_req->ref, 2); 303 - ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); 304 - ovl_aio_put(aio_req); 305 - if (ret != -EIOCBQUEUED) 306 - ovl_aio_cleanup_handler(aio_req); 307 - } 308 - out: 309 - revert_creds(old_cred); 310 - ovl_file_accessed(file); 311 - out_fdput: 362 + ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags, 363 + &ctx); 312 364 fdput(real); 313 365 314 366 return ret; ··· 290 400 struct file *file = iocb->ki_filp; 291 401 struct inode *inode = file_inode(file); 292 402 struct fd real; 293 - const struct cred *old_cred; 294 403 ssize_t ret; 295 404 int ifl = iocb->ki_flags; 405 + struct backing_file_ctx ctx = { 406 + .cred = ovl_creds(inode->i_sb), 407 + .user_file = file, 408 + .end_write = ovl_file_modified, 409 + }; 296 410 297 411 if (!iov_iter_count(iter)) 298 412 return 0; ··· 304 410 inode_lock(inode); 305 411 /* Update mode */ 306 412 ovl_copyattr(inode); 307 - ret = file_remove_privs(file); 308 - if (ret) 309 - goto out_unlock; 310 413 311 414 ret = ovl_real_fdget(file, &real); 312 415 if (ret) 313 416 goto out_unlock; 314 - 315 - ret = -EINVAL; 316 - if (iocb->ki_flags & IOCB_DIRECT && 317 - !(real.file->f_mode & FMODE_CAN_ODIRECT)) 318 - goto out_fdput; 319 417 320 418 if (!ovl_should_sync(OVL_FS(inode->i_sb))) 321 419 ifl &= ~(IOCB_DSYNC | IOCB_SYNC); ··· 317 431 * this property in case it is set by the issuer. 
318 432 */ 319 433 ifl &= ~IOCB_DIO_CALLER_COMP; 320 - 321 - old_cred = ovl_override_creds(file_inode(file)->i_sb); 322 - if (is_sync_kiocb(iocb)) { 323 - rwf_t rwf = iocb_to_rw_flags(ifl); 324 - 325 - file_start_write(real.file); 326 - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf); 327 - file_end_write(real.file); 328 - /* Update size */ 329 - ovl_file_modified(file); 330 - } else { 331 - struct ovl_aio_req *aio_req; 332 - 333 - ret = ovl_init_aio_done_wq(inode->i_sb); 334 - if (ret) 335 - goto out; 336 - 337 - ret = -ENOMEM; 338 - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); 339 - if (!aio_req) 340 - goto out; 341 - 342 - aio_req->orig_iocb = iocb; 343 - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); 344 - aio_req->iocb.ki_flags = ifl; 345 - aio_req->iocb.ki_complete = ovl_aio_queue_completion; 346 - refcount_set(&aio_req->ref, 2); 347 - kiocb_start_write(&aio_req->iocb); 348 - ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); 349 - ovl_aio_put(aio_req); 350 - if (ret != -EIOCBQUEUED) 351 - ovl_aio_cleanup_handler(aio_req); 352 - } 353 - out: 354 - revert_creds(old_cred); 355 - out_fdput: 434 + ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx); 356 435 fdput(real); 357 436 358 437 out_unlock: ··· 330 479 struct pipe_inode_info *pipe, size_t len, 331 480 unsigned int flags) 332 481 { 333 - const struct cred *old_cred; 334 482 struct fd real; 335 483 ssize_t ret; 484 + struct backing_file_ctx ctx = { 485 + .cred = ovl_creds(file_inode(in)->i_sb), 486 + .user_file = in, 487 + .accessed = ovl_file_accessed, 488 + }; 336 489 337 490 ret = ovl_real_fdget(in, &real); 338 491 if (ret) 339 492 return ret; 340 493 341 - old_cred = ovl_override_creds(file_inode(in)->i_sb); 342 - ret = vfs_splice_read(real.file, ppos, pipe, len, flags); 343 - revert_creds(old_cred); 344 - ovl_file_accessed(in); 345 - 494 + ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx); 346 495 fdput(real); 496 + 347 
497 return ret; 348 498 } 349 499 ··· 360 508 loff_t *ppos, size_t len, unsigned int flags) 361 509 { 362 510 struct fd real; 363 - const struct cred *old_cred; 364 511 struct inode *inode = file_inode(out); 365 512 ssize_t ret; 513 + struct backing_file_ctx ctx = { 514 + .cred = ovl_creds(inode->i_sb), 515 + .user_file = out, 516 + .end_write = ovl_file_modified, 517 + }; 366 518 367 519 inode_lock(inode); 368 520 /* Update mode */ 369 521 ovl_copyattr(inode); 370 - ret = file_remove_privs(out); 371 - if (ret) 372 - goto out_unlock; 373 522 374 523 ret = ovl_real_fdget(out, &real); 375 524 if (ret) 376 525 goto out_unlock; 377 526 378 - old_cred = ovl_override_creds(inode->i_sb); 379 - file_start_write(real.file); 380 - 381 - ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); 382 - 383 - file_end_write(real.file); 384 - /* Update size */ 385 - ovl_file_modified(out); 386 - revert_creds(old_cred); 527 + ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx); 387 528 fdput(real); 388 529 389 530 out_unlock: ··· 414 569 static int ovl_mmap(struct file *file, struct vm_area_struct *vma) 415 570 { 416 571 struct file *realfile = file->private_data; 417 - const struct cred *old_cred; 418 - int ret; 572 + struct backing_file_ctx ctx = { 573 + .cred = ovl_creds(file_inode(file)->i_sb), 574 + .user_file = file, 575 + .accessed = ovl_file_accessed, 576 + }; 419 577 420 - if (!realfile->f_op->mmap) 421 - return -ENODEV; 422 - 423 - if (WARN_ON(file != vma->vm_file)) 424 - return -EIO; 425 - 426 - vma_set_file(vma, realfile); 427 - 428 - old_cred = ovl_override_creds(file_inode(file)->i_sb); 429 - ret = call_mmap(vma->vm_file, vma); 430 - revert_creds(old_cred); 431 - ovl_file_accessed(file); 432 - 433 - return ret; 578 + return backing_file_mmap(realfile, vma, &ctx); 434 579 } 435 580 436 581 static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ··· 613 778 .copy_file_range = ovl_copy_file_range, 614 779 
.remap_file_range = ovl_remap_file_range, 615 780 }; 616 - 617 - int __init ovl_aio_request_cache_init(void) 618 - { 619 - ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", 620 - sizeof(struct ovl_aio_req), 621 - 0, SLAB_HWCACHE_ALIGN, NULL); 622 - if (!ovl_aio_request_cachep) 623 - return -ENOMEM; 624 - 625 - return 0; 626 - } 627 - 628 - void ovl_aio_request_cache_destroy(void) 629 - { 630 - kmem_cache_destroy(ovl_aio_request_cachep); 631 - }
+6 -2
fs/overlayfs/overlayfs.h
··· 425 425 void ovl_drop_write(struct dentry *dentry); 426 426 struct dentry *ovl_workdir(struct dentry *dentry); 427 427 const struct cred *ovl_override_creds(struct super_block *sb); 428 + 429 + static inline const struct cred *ovl_creds(struct super_block *sb) 430 + { 431 + return OVL_FS(sb)->creator_cred; 432 + } 433 + 428 434 int ovl_can_decode_fh(struct super_block *sb); 429 435 struct dentry *ovl_indexdir(struct super_block *sb); 430 436 bool ovl_index_all(struct super_block *sb); ··· 843 837 844 838 /* file.c */ 845 839 extern const struct file_operations ovl_file_operations; 846 - int __init ovl_aio_request_cache_init(void); 847 - void ovl_aio_request_cache_destroy(void); 848 840 int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); 849 841 int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); 850 842 int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+3 -8
fs/overlayfs/super.c
··· 1501 1501 if (ovl_inode_cachep == NULL) 1502 1502 return -ENOMEM; 1503 1503 1504 - err = ovl_aio_request_cache_init(); 1505 - if (!err) { 1506 - err = register_filesystem(&ovl_fs_type); 1507 - if (!err) 1508 - return 0; 1504 + err = register_filesystem(&ovl_fs_type); 1505 + if (!err) 1506 + return 0; 1509 1507 1510 - ovl_aio_request_cache_destroy(); 1511 - } 1512 1508 kmem_cache_destroy(ovl_inode_cachep); 1513 1509 1514 1510 return err; ··· 1520 1524 */ 1521 1525 rcu_barrier(); 1522 1526 kmem_cache_destroy(ovl_inode_cachep); 1523 - ovl_aio_request_cache_destroy(); 1524 1527 } 1525 1528 1526 1529 module_init(ovl_init);
+133 -116
fs/read_write.c
··· 354 354 355 355 int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) 356 356 { 357 + int mask = read_write == READ ? MAY_READ : MAY_WRITE; 358 + int ret; 359 + 357 360 if (unlikely((ssize_t) count < 0)) 358 361 return -EINVAL; 359 362 ··· 374 371 } 375 372 } 376 373 377 - return security_file_permission(file, 378 - read_write == READ ? MAY_READ : MAY_WRITE); 374 + ret = security_file_permission(file, mask); 375 + if (ret) 376 + return ret; 377 + 378 + return fsnotify_file_area_perm(file, mask, ppos, count); 379 379 } 380 380 EXPORT_SYMBOL(rw_verify_area); 381 381 ··· 779 773 return ret; 780 774 } 781 775 782 - static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, 783 - loff_t *pos, rwf_t flags) 784 - { 785 - size_t tot_len; 786 - ssize_t ret = 0; 787 - 788 - if (!(file->f_mode & FMODE_READ)) 789 - return -EBADF; 790 - if (!(file->f_mode & FMODE_CAN_READ)) 791 - return -EINVAL; 792 - 793 - tot_len = iov_iter_count(iter); 794 - if (!tot_len) 795 - goto out; 796 - ret = rw_verify_area(READ, file, pos, tot_len); 797 - if (ret < 0) 798 - return ret; 799 - 800 - if (file->f_op->read_iter) 801 - ret = do_iter_readv_writev(file, iter, pos, READ, flags); 802 - else 803 - ret = do_loop_readv_writev(file, iter, pos, READ, flags); 804 - out: 805 - if (ret >= 0) 806 - fsnotify_access(file); 807 - return ret; 808 - } 809 - 810 776 ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, 811 777 struct iov_iter *iter) 812 778 { ··· 808 830 EXPORT_SYMBOL(vfs_iocb_iter_read); 809 831 810 832 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, 811 - rwf_t flags) 812 - { 813 - if (!file->f_op->read_iter) 814 - return -EINVAL; 815 - return do_iter_read(file, iter, ppos, flags); 816 - } 817 - EXPORT_SYMBOL(vfs_iter_read); 818 - 819 - static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, 820 - loff_t *pos, rwf_t flags) 833 + rwf_t flags) 821 834 { 822 835 size_t tot_len; 823 836 
ssize_t ret = 0; 824 837 825 - if (!(file->f_mode & FMODE_WRITE)) 838 + if (!file->f_op->read_iter) 839 + return -EINVAL; 840 + if (!(file->f_mode & FMODE_READ)) 826 841 return -EBADF; 827 - if (!(file->f_mode & FMODE_CAN_WRITE)) 842 + if (!(file->f_mode & FMODE_CAN_READ)) 828 843 return -EINVAL; 829 844 830 845 tot_len = iov_iter_count(iter); 831 846 if (!tot_len) 832 - return 0; 833 - ret = rw_verify_area(WRITE, file, pos, tot_len); 847 + goto out; 848 + ret = rw_verify_area(READ, file, ppos, tot_len); 834 849 if (ret < 0) 835 850 return ret; 836 851 837 - if (file->f_op->write_iter) 838 - ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); 839 - else 840 - ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); 841 - if (ret > 0) 842 - fsnotify_modify(file); 852 + ret = do_iter_readv_writev(file, iter, ppos, READ, flags); 853 + out: 854 + if (ret >= 0) 855 + fsnotify_access(file); 843 856 return ret; 844 857 } 858 + EXPORT_SYMBOL(vfs_iter_read); 845 859 860 + /* 861 + * Caller is responsible for calling kiocb_end_write() on completion 862 + * if async iocb was queued. 
863 + */ 846 864 ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, 847 865 struct iov_iter *iter) 848 866 { ··· 859 885 if (ret < 0) 860 886 return ret; 861 887 888 + kiocb_start_write(iocb); 862 889 ret = call_write_iter(file, iocb, iter); 890 + if (ret != -EIOCBQUEUED) 891 + kiocb_end_write(iocb); 863 892 if (ret > 0) 864 893 fsnotify_modify(file); 865 894 ··· 871 894 EXPORT_SYMBOL(vfs_iocb_iter_write); 872 895 873 896 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, 874 - rwf_t flags) 897 + rwf_t flags) 875 898 { 899 + size_t tot_len; 900 + ssize_t ret; 901 + 902 + if (!(file->f_mode & FMODE_WRITE)) 903 + return -EBADF; 904 + if (!(file->f_mode & FMODE_CAN_WRITE)) 905 + return -EINVAL; 876 906 if (!file->f_op->write_iter) 877 907 return -EINVAL; 878 - return do_iter_write(file, iter, ppos, flags); 908 + 909 + tot_len = iov_iter_count(iter); 910 + if (!tot_len) 911 + return 0; 912 + 913 + ret = rw_verify_area(WRITE, file, ppos, tot_len); 914 + if (ret < 0) 915 + return ret; 916 + 917 + file_start_write(file); 918 + ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); 919 + if (ret > 0) 920 + fsnotify_modify(file); 921 + file_end_write(file); 922 + 923 + return ret; 879 924 } 880 925 EXPORT_SYMBOL(vfs_iter_write); 881 926 882 927 static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 883 - unsigned long vlen, loff_t *pos, rwf_t flags) 928 + unsigned long vlen, loff_t *pos, rwf_t flags) 884 929 { 885 930 struct iovec iovstack[UIO_FASTIOV]; 886 931 struct iovec *iov = iovstack; 887 932 struct iov_iter iter; 888 - ssize_t ret; 933 + size_t tot_len; 934 + ssize_t ret = 0; 889 935 890 - ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 891 - if (ret >= 0) { 892 - ret = do_iter_read(file, &iter, pos, flags); 893 - kfree(iov); 894 - } 936 + if (!(file->f_mode & FMODE_READ)) 937 + return -EBADF; 938 + if (!(file->f_mode & FMODE_CAN_READ)) 939 + return -EINVAL; 895 940 941 
+ ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, 942 + &iter); 943 + if (ret < 0) 944 + return ret; 945 + 946 + tot_len = iov_iter_count(&iter); 947 + if (!tot_len) 948 + goto out; 949 + 950 + ret = rw_verify_area(READ, file, pos, tot_len); 951 + if (ret < 0) 952 + goto out; 953 + 954 + if (file->f_op->read_iter) 955 + ret = do_iter_readv_writev(file, &iter, pos, READ, flags); 956 + else 957 + ret = do_loop_readv_writev(file, &iter, pos, READ, flags); 958 + out: 959 + if (ret >= 0) 960 + fsnotify_access(file); 961 + kfree(iov); 896 962 return ret; 897 963 } 898 964 899 965 static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, 900 - unsigned long vlen, loff_t *pos, rwf_t flags) 966 + unsigned long vlen, loff_t *pos, rwf_t flags) 901 967 { 902 968 struct iovec iovstack[UIO_FASTIOV]; 903 969 struct iovec *iov = iovstack; 904 970 struct iov_iter iter; 905 - ssize_t ret; 971 + size_t tot_len; 972 + ssize_t ret = 0; 906 973 907 - ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 908 - if (ret >= 0) { 909 - file_start_write(file); 910 - ret = do_iter_write(file, &iter, pos, flags); 911 - file_end_write(file); 912 - kfree(iov); 913 - } 974 + if (!(file->f_mode & FMODE_WRITE)) 975 + return -EBADF; 976 + if (!(file->f_mode & FMODE_CAN_WRITE)) 977 + return -EINVAL; 978 + 979 + ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, 980 + &iter); 981 + if (ret < 0) 982 + return ret; 983 + 984 + tot_len = iov_iter_count(&iter); 985 + if (!tot_len) 986 + goto out; 987 + 988 + ret = rw_verify_area(WRITE, file, pos, tot_len); 989 + if (ret < 0) 990 + goto out; 991 + 992 + file_start_write(file); 993 + if (file->f_op->write_iter) 994 + ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); 995 + else 996 + ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); 997 + if (ret > 0) 998 + fsnotify_modify(file); 999 + file_end_write(file); 1000 + out: 1001 + kfree(iov); 914 1002 return 
ret; 915 1003 } 916 1004 ··· 1220 1178 #endif /* CONFIG_COMPAT */ 1221 1179 1222 1180 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 1223 - size_t count, loff_t max) 1181 + size_t count, loff_t max) 1224 1182 { 1225 1183 struct fd in, out; 1226 1184 struct inode *in_inode, *out_inode; ··· 1292 1250 retval = rw_verify_area(WRITE, out.file, &out_pos, count); 1293 1251 if (retval < 0) 1294 1252 goto fput_out; 1295 - file_start_write(out.file); 1296 1253 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, 1297 1254 count, fl); 1298 - file_end_write(out.file); 1299 1255 } else { 1300 1256 if (out.file->f_flags & O_NONBLOCK) 1301 1257 fl |= SPLICE_F_NONBLOCK; ··· 1402 1362 } 1403 1363 #endif 1404 1364 1405 - /** 1406 - * generic_copy_file_range - copy data between two files 1407 - * @file_in: file structure to read from 1408 - * @pos_in: file offset to read from 1409 - * @file_out: file structure to write data to 1410 - * @pos_out: file offset to write data to 1411 - * @len: amount of data to copy 1412 - * @flags: copy flags 1413 - * 1414 - * This is a generic filesystem helper to copy data from one file to another. 1415 - * It has no constraints on the source or destination file owners - the files 1416 - * can belong to different superblocks and different filesystem types. Short 1417 - * copies are allowed. 1418 - * 1419 - * This should be called from the @file_out filesystem, as per the 1420 - * ->copy_file_range() method. 1421 - * 1422 - * Returns the number of bytes copied or a negative error indicating the 1423 - * failure. 1424 - */ 1425 - 1426 - ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, 1427 - struct file *file_out, loff_t pos_out, 1428 - size_t len, unsigned int flags) 1429 - { 1430 - lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); 1431 - 1432 - return do_splice_direct(file_in, &pos_in, file_out, &pos_out, 1433 - len > MAX_RW_COUNT ? 
MAX_RW_COUNT : len, 0); 1434 - } 1435 - EXPORT_SYMBOL(generic_copy_file_range); 1436 - 1437 1365 /* 1438 1366 * Performs necessary checks before doing a file copy 1439 1367 * ··· 1486 1478 { 1487 1479 ssize_t ret; 1488 1480 bool splice = flags & COPY_FILE_SPLICE; 1481 + bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; 1489 1482 1490 1483 if (flags & ~COPY_FILE_SPLICE) 1491 1484 return -EINVAL; ··· 1518 1509 ret = file_out->f_op->copy_file_range(file_in, pos_in, 1519 1510 file_out, pos_out, 1520 1511 len, flags); 1521 - goto done; 1522 - } 1523 - 1524 - if (!splice && file_in->f_op->remap_file_range && 1525 - file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { 1512 + } else if (!splice && file_in->f_op->remap_file_range && samesb) { 1526 1513 ret = file_in->f_op->remap_file_range(file_in, pos_in, 1527 1514 file_out, pos_out, 1528 1515 min_t(loff_t, MAX_RW_COUNT, len), 1529 1516 REMAP_FILE_CAN_SHORTEN); 1530 - if (ret > 0) 1531 - goto done; 1517 + /* fallback to splice */ 1518 + if (ret <= 0) 1519 + splice = true; 1520 + } else if (samesb) { 1521 + /* Fallback to splice for same sb copy for backward compat */ 1522 + splice = true; 1532 1523 } 1524 + 1525 + file_end_write(file_out); 1526 + 1527 + if (!splice) 1528 + goto done; 1533 1529 1534 1530 /* 1535 1531 * We can get here for same sb copy of filesystems that do not implement ··· 1547 1533 * and which filesystems do not, that will allow userspace tools to 1548 1534 * make consistent desicions w.r.t using copy_file_range(). 1549 1535 * 1550 - * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE. 1536 + * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE 1537 + * for server-side-copy between any two sb. 
1538 + * 1539 + * In any case, we call do_splice_direct() and not splice_file_range(), 1540 + * without file_start_write() held, to avoid possible deadlocks related 1541 + * to splicing from input file, while file_start_write() is held on 1542 + * the output file on a different sb. 1551 1543 */ 1552 - ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, 1553 - flags); 1554 - 1544 + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, 1545 + min_t(size_t, len, MAX_RW_COUNT), 0); 1555 1546 done: 1556 1547 if (ret > 0) { 1557 1548 fsnotify_access(file_in); ··· 1567 1548 1568 1549 inc_syscr(current); 1569 1550 inc_syscw(current); 1570 - 1571 - file_end_write(file_out); 1572 1551 1573 1552 return ret; 1574 1553 }
+4
fs/readdir.c
··· 96 96 if (res) 97 97 goto out; 98 98 99 + res = fsnotify_file_perm(file, MAY_READ); 100 + if (res) 101 + goto out; 102 + 99 103 res = down_read_killable(&inode->i_rwsem); 100 104 if (res) 101 105 goto out;
+28 -17
fs/remap_range.c
··· 102 102 static int remap_verify_area(struct file *file, loff_t pos, loff_t len, 103 103 bool write) 104 104 { 105 + int mask = write ? MAY_WRITE : MAY_READ; 105 106 loff_t tmp; 107 + int ret; 106 108 107 109 if (unlikely(pos < 0 || len < 0)) 108 110 return -EINVAL; ··· 112 110 if (unlikely(check_add_overflow(pos, len, &tmp))) 113 111 return -EINVAL; 114 112 115 - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); 113 + ret = security_file_permission(file, mask); 114 + if (ret) 115 + return ret; 116 + 117 + return fsnotify_file_area_perm(file, mask, &pos, len); 116 118 } 117 119 118 120 /* ··· 391 385 if (!file_in->f_op->remap_file_range) 392 386 return -EOPNOTSUPP; 393 387 394 - ret = remap_verify_area(file_in, pos_in, len, false); 395 - if (ret) 396 - return ret; 397 - 398 - ret = remap_verify_area(file_out, pos_out, len, true); 399 - if (ret) 400 - return ret; 401 - 402 388 ret = file_in->f_op->remap_file_range(file_in, pos_in, 403 389 file_out, pos_out, len, remap_flags); 404 390 if (ret < 0) ··· 408 410 { 409 411 loff_t ret; 410 412 413 + ret = remap_verify_area(file_in, pos_in, len, false); 414 + if (ret) 415 + return ret; 416 + 417 + ret = remap_verify_area(file_out, pos_out, len, true); 418 + if (ret) 419 + return ret; 420 + 411 421 file_start_write(file_out); 412 422 ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, 413 423 remap_flags); ··· 426 420 EXPORT_SYMBOL(vfs_clone_file_range); 427 421 428 422 /* Check whether we are allowed to dedupe the destination file */ 429 - static bool allow_file_dedupe(struct file *file) 423 + static bool may_dedupe_file(struct file *file) 430 424 { 431 425 struct mnt_idmap *idmap = file_mnt_idmap(file); 432 426 struct inode *inode = file_inode(file); ··· 451 445 WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | 452 446 REMAP_FILE_CAN_SHORTEN)); 453 447 454 - ret = mnt_want_write_file(dst_file); 455 - if (ret) 456 - return ret; 457 - 458 448 /* 459 449 * This is redundant if called from 
vfs_dedupe_file_range(), but other 460 450 * callers need it and it's not performance sesitive... 461 451 */ 462 452 ret = remap_verify_area(src_file, src_pos, len, false); 463 453 if (ret) 464 - goto out_drop_write; 454 + return ret; 465 455 466 456 ret = remap_verify_area(dst_file, dst_pos, len, true); 467 457 if (ret) 468 - goto out_drop_write; 458 + return ret; 459 + 460 + /* 461 + * This needs to be called after remap_verify_area() because of 462 + * sb_start_write() and before may_dedupe_file() because the mount's 463 + * MAY_WRITE need to be checked with mnt_get_write_access_file() held. 464 + */ 465 + ret = mnt_want_write_file(dst_file); 466 + if (ret) 467 + return ret; 469 468 470 469 ret = -EPERM; 471 - if (!allow_file_dedupe(dst_file)) 470 + if (!may_dedupe_file(dst_file)) 472 471 goto out_drop_write; 473 472 474 473 ret = -EXDEV;
+3 -2
fs/smb/client/cifsfs.c
··· 25 25 #include <linux/freezer.h> 26 26 #include <linux/namei.h> 27 27 #include <linux/random.h> 28 + #include <linux/splice.h> 28 29 #include <linux/uuid.h> 29 30 #include <linux/xattr.h> 30 31 #include <uapi/linux/magic.h> ··· 1507 1506 free_xid(xid); 1508 1507 1509 1508 if (rc == -EOPNOTSUPP || rc == -EXDEV) 1510 - rc = generic_copy_file_range(src_file, off, dst_file, 1511 - destoff, len, flags); 1509 + rc = splice_copy_file_range(src_file, off, dst_file, 1510 + destoff, len); 1512 1511 return rc; 1513 1512 } 1514 1513
+163 -96
fs/splice.c
··· 201 201 unsigned int tail = pipe->tail; 202 202 unsigned int head = pipe->head; 203 203 unsigned int mask = pipe->ring_size - 1; 204 - int ret = 0, page_nr = 0; 204 + ssize_t ret = 0; 205 + int page_nr = 0; 205 206 206 207 if (!spd_pages) 207 208 return 0; ··· 674 673 .u.file = out, 675 674 }; 676 675 int nbufs = pipe->max_usage; 677 - struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), 678 - GFP_KERNEL); 676 + struct bio_vec *array; 679 677 ssize_t ret; 680 678 679 + if (!out->f_op->write_iter) 680 + return -EINVAL; 681 + 682 + array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); 681 683 if (unlikely(!array)) 682 684 return -ENOMEM; 683 685 ··· 688 684 689 685 splice_from_pipe_begin(&sd); 690 686 while (sd.total_len) { 687 + struct kiocb kiocb; 691 688 struct iov_iter from; 692 689 unsigned int head, tail, mask; 693 690 size_t left; ··· 738 733 } 739 734 740 735 iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); 741 - ret = vfs_iter_write(out, &from, &sd.pos, 0); 736 + init_sync_kiocb(&kiocb, out); 737 + kiocb.ki_pos = sd.pos; 738 + ret = call_write_iter(out, &kiocb, &from); 739 + sd.pos = kiocb.ki_pos; 742 740 if (ret <= 0) 743 741 break; 744 742 ··· 933 925 /* 934 926 * Attempt to initiate a splice from pipe to file. 935 927 */ 936 - static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 937 - loff_t *ppos, size_t len, unsigned int flags) 928 + static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out, 929 + loff_t *ppos, size_t len, unsigned int flags) 938 930 { 939 931 if (unlikely(!out->f_op->splice_write)) 940 932 return warn_unsupported(out, "write"); ··· 952 944 sd->splice_eof(sd); 953 945 } 954 946 947 + /* 948 + * Callers already called rw_verify_area() on the entire range. 949 + * No need to call it for sub ranges. 
950 + */ 951 + static ssize_t do_splice_read(struct file *in, loff_t *ppos, 952 + struct pipe_inode_info *pipe, size_t len, 953 + unsigned int flags) 954 + { 955 + unsigned int p_space; 956 + 957 + if (unlikely(!(in->f_mode & FMODE_READ))) 958 + return -EBADF; 959 + if (!len) 960 + return 0; 961 + 962 + /* Don't try to read more the pipe has space for. */ 963 + p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); 964 + len = min_t(size_t, len, p_space << PAGE_SHIFT); 965 + 966 + if (unlikely(len > MAX_RW_COUNT)) 967 + len = MAX_RW_COUNT; 968 + 969 + if (unlikely(!in->f_op->splice_read)) 970 + return warn_unsupported(in, "read"); 971 + /* 972 + * O_DIRECT and DAX don't deal with the pagecache, so we allocate a 973 + * buffer, copy into it and splice that into the pipe. 974 + */ 975 + if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) 976 + return copy_splice_read(in, ppos, pipe, len, flags); 977 + return in->f_op->splice_read(in, ppos, pipe, len, flags); 978 + } 979 + 955 980 /** 956 981 * vfs_splice_read - Read data from a file and splice it into a pipe 957 982 * @in: File to splice from ··· 1000 959 * If successful, it returns the amount of data spliced, 0 if it hit the EOF or 1001 960 * a hole and a negative error code otherwise. 1002 961 */ 1003 - long vfs_splice_read(struct file *in, loff_t *ppos, 1004 - struct pipe_inode_info *pipe, size_t len, 1005 - unsigned int flags) 962 + ssize_t vfs_splice_read(struct file *in, loff_t *ppos, 963 + struct pipe_inode_info *pipe, size_t len, 964 + unsigned int flags) 1006 965 { 1007 - unsigned int p_space; 1008 - int ret; 1009 - 1010 - if (unlikely(!(in->f_mode & FMODE_READ))) 1011 - return -EBADF; 1012 - if (!len) 1013 - return 0; 1014 - 1015 - /* Don't try to read more the pipe has space for. 
*/ 1016 - p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); 1017 - len = min_t(size_t, len, p_space << PAGE_SHIFT); 966 + ssize_t ret; 1018 967 1019 968 ret = rw_verify_area(READ, in, ppos, len); 1020 969 if (unlikely(ret < 0)) 1021 970 return ret; 1022 971 1023 - if (unlikely(len > MAX_RW_COUNT)) 1024 - len = MAX_RW_COUNT; 1025 - 1026 - if (unlikely(!in->f_op->splice_read)) 1027 - return warn_unsupported(in, "read"); 1028 - /* 1029 - * O_DIRECT and DAX don't deal with the pagecache, so we allocate a 1030 - * buffer, copy into it and splice that into the pipe. 1031 - */ 1032 - if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) 1033 - return copy_splice_read(in, ppos, pipe, len, flags); 1034 - return in->f_op->splice_read(in, ppos, pipe, len, flags); 972 + return do_splice_read(in, ppos, pipe, len, flags); 1035 973 } 1036 974 EXPORT_SYMBOL_GPL(vfs_splice_read); 1037 975 ··· 1031 1011 splice_direct_actor *actor) 1032 1012 { 1033 1013 struct pipe_inode_info *pipe; 1034 - long ret, bytes; 1014 + ssize_t ret, bytes; 1035 1015 size_t len; 1036 1016 int i, flags, more; 1037 1017 ··· 1086 1066 size_t read_len; 1087 1067 loff_t pos = sd->pos, prev_pos = pos; 1088 1068 1089 - ret = vfs_splice_read(in, &pos, pipe, len, flags); 1069 + ret = do_splice_read(in, &pos, pipe, len, flags); 1090 1070 if (unlikely(ret <= 0)) 1091 1071 goto read_failure; 1092 1072 ··· 1158 1138 struct splice_desc *sd) 1159 1139 { 1160 1140 struct file *file = sd->u.file; 1141 + long ret; 1161 1142 1162 - return do_splice_from(pipe, file, sd->opos, sd->total_len, 1163 - sd->flags); 1143 + file_start_write(file); 1144 + ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); 1145 + file_end_write(file); 1146 + return ret; 1147 + } 1148 + 1149 + static int splice_file_range_actor(struct pipe_inode_info *pipe, 1150 + struct splice_desc *sd) 1151 + { 1152 + struct file *file = sd->u.file; 1153 + 1154 + return do_splice_from(pipe, file, sd->opos, sd->total_len, 
sd->flags); 1164 1155 } 1165 1156 1166 1157 static void direct_file_splice_eof(struct splice_desc *sd) ··· 1182 1151 file->f_op->splice_eof(file); 1183 1152 } 1184 1153 1154 + static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos, 1155 + struct file *out, loff_t *opos, 1156 + size_t len, unsigned int flags, 1157 + splice_direct_actor *actor) 1158 + { 1159 + struct splice_desc sd = { 1160 + .len = len, 1161 + .total_len = len, 1162 + .flags = flags, 1163 + .pos = *ppos, 1164 + .u.file = out, 1165 + .splice_eof = direct_file_splice_eof, 1166 + .opos = opos, 1167 + }; 1168 + ssize_t ret; 1169 + 1170 + if (unlikely(!(out->f_mode & FMODE_WRITE))) 1171 + return -EBADF; 1172 + 1173 + if (unlikely(out->f_flags & O_APPEND)) 1174 + return -EINVAL; 1175 + 1176 + ret = splice_direct_to_actor(in, &sd, actor); 1177 + if (ret > 0) 1178 + *ppos = sd.pos; 1179 + 1180 + return ret; 1181 + } 1185 1182 /** 1186 1183 * do_splice_direct - splices data directly between two files 1187 1184 * @in: file to splice from ··· 1225 1166 * (splice in + splice out, as compared to just sendfile()). So this helper 1226 1167 * can splice directly through a process-private pipe. 1227 1168 * 1169 + * Callers already called rw_verify_area() on the entire range. 
1228 1170 */ 1229 - long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1230 - loff_t *opos, size_t len, unsigned int flags) 1171 + ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1172 + loff_t *opos, size_t len, unsigned int flags) 1231 1173 { 1232 - struct splice_desc sd = { 1233 - .len = len, 1234 - .total_len = len, 1235 - .flags = flags, 1236 - .pos = *ppos, 1237 - .u.file = out, 1238 - .splice_eof = direct_file_splice_eof, 1239 - .opos = opos, 1240 - }; 1241 - long ret; 1242 - 1243 - if (unlikely(!(out->f_mode & FMODE_WRITE))) 1244 - return -EBADF; 1245 - 1246 - if (unlikely(out->f_flags & O_APPEND)) 1247 - return -EINVAL; 1248 - 1249 - ret = rw_verify_area(WRITE, out, opos, len); 1250 - if (unlikely(ret < 0)) 1251 - return ret; 1252 - 1253 - ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1254 - if (ret > 0) 1255 - *ppos = sd.pos; 1256 - 1257 - return ret; 1174 + return do_splice_direct_actor(in, ppos, out, opos, len, flags, 1175 + direct_splice_actor); 1258 1176 } 1259 1177 EXPORT_SYMBOL(do_splice_direct); 1178 + 1179 + /** 1180 + * splice_file_range - splices data between two files for copy_file_range() 1181 + * @in: file to splice from 1182 + * @ppos: input file offset 1183 + * @out: file to splice to 1184 + * @opos: output file offset 1185 + * @len: number of bytes to splice 1186 + * 1187 + * Description: 1188 + * For use by ->copy_file_range() methods. 1189 + * Like do_splice_direct(), but vfs_copy_file_range() already holds 1190 + * start_file_write() on @out file. 1191 + * 1192 + * Callers already called rw_verify_area() on the entire range. 
1193 + */ 1194 + ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, 1195 + loff_t *opos, size_t len) 1196 + { 1197 + lockdep_assert(file_write_started(out)); 1198 + 1199 + return do_splice_direct_actor(in, ppos, out, opos, 1200 + min_t(size_t, len, MAX_RW_COUNT), 1201 + 0, splice_file_range_actor); 1202 + } 1203 + EXPORT_SYMBOL(splice_file_range); 1260 1204 1261 1205 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) 1262 1206 { ··· 1282 1220 struct pipe_inode_info *opipe, 1283 1221 size_t len, unsigned int flags); 1284 1222 1285 - long splice_file_to_pipe(struct file *in, 1286 - struct pipe_inode_info *opipe, 1287 - loff_t *offset, 1288 - size_t len, unsigned int flags) 1223 + ssize_t splice_file_to_pipe(struct file *in, 1224 + struct pipe_inode_info *opipe, 1225 + loff_t *offset, 1226 + size_t len, unsigned int flags) 1289 1227 { 1290 - long ret; 1228 + ssize_t ret; 1291 1229 1292 1230 pipe_lock(opipe); 1293 1231 ret = wait_for_space(opipe, flags); 1294 1232 if (!ret) 1295 - ret = vfs_splice_read(in, offset, opipe, len, flags); 1233 + ret = do_splice_read(in, offset, opipe, len, flags); 1296 1234 pipe_unlock(opipe); 1297 1235 if (ret > 0) 1298 1236 wakeup_pipe_readers(opipe); ··· 1302 1240 /* 1303 1241 * Determine where to splice to/from. 
1304 1242 */ 1305 - long do_splice(struct file *in, loff_t *off_in, struct file *out, 1306 - loff_t *off_out, size_t len, unsigned int flags) 1243 + ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, 1244 + loff_t *off_out, size_t len, unsigned int flags) 1307 1245 { 1308 1246 struct pipe_inode_info *ipipe; 1309 1247 struct pipe_inode_info *opipe; 1310 1248 loff_t offset; 1311 - long ret; 1249 + ssize_t ret; 1312 1250 1313 1251 if (unlikely(!(in->f_mode & FMODE_READ) || 1314 1252 !(out->f_mode & FMODE_WRITE))) ··· 1369 1307 offset = in->f_pos; 1370 1308 } 1371 1309 1310 + ret = rw_verify_area(READ, in, &offset, len); 1311 + if (unlikely(ret < 0)) 1312 + return ret; 1313 + 1372 1314 if (out->f_flags & O_NONBLOCK) 1373 1315 flags |= SPLICE_F_NONBLOCK; 1374 1316 ··· 1399 1333 return ret; 1400 1334 } 1401 1335 1402 - static long __do_splice(struct file *in, loff_t __user *off_in, 1403 - struct file *out, loff_t __user *off_out, 1404 - size_t len, unsigned int flags) 1336 + static ssize_t __do_splice(struct file *in, loff_t __user *off_in, 1337 + struct file *out, loff_t __user *off_out, 1338 + size_t len, unsigned int flags) 1405 1339 { 1406 1340 struct pipe_inode_info *ipipe; 1407 1341 struct pipe_inode_info *opipe; 1408 1342 loff_t offset, *__off_in = NULL, *__off_out = NULL; 1409 - long ret; 1343 + ssize_t ret; 1410 1344 1411 1345 ipipe = get_pipe_info(in, true); 1412 1346 opipe = get_pipe_info(out, true); ··· 1445 1379 return ret; 1446 1380 } 1447 1381 1448 - static int iter_to_pipe(struct iov_iter *from, 1449 - struct pipe_inode_info *pipe, 1450 - unsigned flags) 1382 + static ssize_t iter_to_pipe(struct iov_iter *from, 1383 + struct pipe_inode_info *pipe, 1384 + unsigned int flags) 1451 1385 { 1452 1386 struct pipe_buffer buf = { 1453 1387 .ops = &user_page_pipe_buf_ops, 1454 1388 .flags = flags 1455 1389 }; 1456 1390 size_t total = 0; 1457 - int ret = 0; 1391 + ssize_t ret = 0; 1458 1392 1459 1393 while (iov_iter_count(from)) { 1460 1394 struct 
page *pages[16]; ··· 1503 1437 * For lack of a better implementation, implement vmsplice() to userspace 1504 1438 * as a simple copy of the pipes pages to the user iov. 1505 1439 */ 1506 - static long vmsplice_to_user(struct file *file, struct iov_iter *iter, 1507 - unsigned int flags) 1440 + static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, 1441 + unsigned int flags) 1508 1442 { 1509 1443 struct pipe_inode_info *pipe = get_pipe_info(file, true); 1510 1444 struct splice_desc sd = { ··· 1512 1446 .flags = flags, 1513 1447 .u.data = iter 1514 1448 }; 1515 - long ret = 0; 1449 + ssize_t ret = 0; 1516 1450 1517 1451 if (!pipe) 1518 1452 return -EBADF; ··· 1536 1470 * as splice-from-memory, where the regular splice is splice-from-file (or 1537 1471 * to file). In both cases the output is a pipe, naturally. 1538 1472 */ 1539 - static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, 1540 - unsigned int flags) 1473 + static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, 1474 + unsigned int flags) 1541 1475 { 1542 1476 struct pipe_inode_info *pipe; 1543 - long ret = 0; 1477 + ssize_t ret = 0; 1544 1478 unsigned buf_flag = 0; 1545 1479 1546 1480 if (flags & SPLICE_F_GIFT) ··· 1636 1570 size_t, len, unsigned int, flags) 1637 1571 { 1638 1572 struct fd in, out; 1639 - long error; 1573 + ssize_t error; 1640 1574 1641 1575 if (unlikely(!len)) 1642 1576 return 0; ··· 1650 1584 out = fdget(fd_out); 1651 1585 if (out.file) { 1652 1586 error = __do_splice(in.file, off_in, out.file, off_out, 1653 - len, flags); 1587 + len, flags); 1654 1588 fdput(out); 1655 1589 } 1656 1590 fdput(in); ··· 1873 1807 /* 1874 1808 * Link contents of ipipe to opipe. 
1875 1809 */ 1876 - static int link_pipe(struct pipe_inode_info *ipipe, 1877 - struct pipe_inode_info *opipe, 1878 - size_t len, unsigned int flags) 1810 + static ssize_t link_pipe(struct pipe_inode_info *ipipe, 1811 + struct pipe_inode_info *opipe, 1812 + size_t len, unsigned int flags) 1879 1813 { 1880 1814 struct pipe_buffer *ibuf, *obuf; 1881 1815 unsigned int i_head, o_head; 1882 1816 unsigned int i_tail, o_tail; 1883 1817 unsigned int i_mask, o_mask; 1884 - int ret = 0; 1818 + ssize_t ret = 0; 1885 1819 1886 1820 /* 1887 1821 * Potential ABBA deadlock, work around it by ordering lock ··· 1964 1898 * The 'flags' used are the SPLICE_F_* variants, currently the only 1965 1899 * applicable one is SPLICE_F_NONBLOCK. 1966 1900 */ 1967 - long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) 1901 + ssize_t do_tee(struct file *in, struct file *out, size_t len, 1902 + unsigned int flags) 1968 1903 { 1969 1904 struct pipe_inode_info *ipipe = get_pipe_info(in, true); 1970 1905 struct pipe_inode_info *opipe = get_pipe_info(out, true); 1971 - int ret = -EINVAL; 1906 + ssize_t ret = -EINVAL; 1972 1907 1973 1908 if (unlikely(!(in->f_mode & FMODE_READ) || 1974 1909 !(out->f_mode & FMODE_WRITE))) ··· 2006 1939 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) 2007 1940 { 2008 1941 struct fd in, out; 2009 - int error; 1942 + ssize_t error; 2010 1943 2011 1944 if (unlikely(flags & ~SPLICE_F_ALL)) 2012 1945 return -EINVAL;
+42
include/linux/backing-file.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Common helpers for stackable filesystems and backing files. 4 + * 5 + * Copyright (C) 2023 CTERA Networks. 6 + */ 7 + 8 + #ifndef _LINUX_BACKING_FILE_H 9 + #define _LINUX_BACKING_FILE_H 10 + 11 + #include <linux/file.h> 12 + #include <linux/uio.h> 13 + #include <linux/fs.h> 14 + 15 + struct backing_file_ctx { 16 + const struct cred *cred; 17 + struct file *user_file; 18 + void (*accessed)(struct file *); 19 + void (*end_write)(struct file *); 20 + }; 21 + 22 + struct file *backing_file_open(const struct path *user_path, int flags, 23 + const struct path *real_path, 24 + const struct cred *cred); 25 + ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, 26 + struct kiocb *iocb, int flags, 27 + struct backing_file_ctx *ctx); 28 + ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, 29 + struct kiocb *iocb, int flags, 30 + struct backing_file_ctx *ctx); 31 + ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, 32 + struct pipe_inode_info *pipe, size_t len, 33 + unsigned int flags, 34 + struct backing_file_ctx *ctx); 35 + ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, 36 + struct file *out, loff_t *ppos, size_t len, 37 + unsigned int flags, 38 + struct backing_file_ctx *ctx); 39 + int backing_file_mmap(struct file *file, struct vm_area_struct *vma, 40 + struct backing_file_ctx *ctx); 41 + 42 + #endif /* _LINUX_BACKING_FILE_H */
+62 -9
include/linux/fs.h
··· 1648 1648 #define __sb_writers_release(sb, lev) \ 1649 1649 percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) 1650 1650 1651 + /** 1652 + * __sb_write_started - check if sb freeze level is held 1653 + * @sb: the super we write to 1654 + * @level: the freeze level 1655 + * 1656 + * * > 0 - sb freeze level is held 1657 + * * 0 - sb freeze level is not held 1658 + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN 1659 + */ 1660 + static inline int __sb_write_started(const struct super_block *sb, int level) 1661 + { 1662 + return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); 1663 + } 1664 + 1665 + /** 1666 + * sb_write_started - check if SB_FREEZE_WRITE is held 1667 + * @sb: the super we write to 1668 + * 1669 + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. 1670 + */ 1651 1671 static inline bool sb_write_started(const struct super_block *sb) 1652 1672 { 1653 - return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1); 1673 + return __sb_write_started(sb, SB_FREEZE_WRITE); 1674 + } 1675 + 1676 + /** 1677 + * sb_write_not_started - check if SB_FREEZE_WRITE is not held 1678 + * @sb: the super we write to 1679 + * 1680 + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. 1681 + */ 1682 + static inline bool sb_write_not_started(const struct super_block *sb) 1683 + { 1684 + return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; 1685 + } 1686 + 1687 + /** 1688 + * file_write_started - check if SB_FREEZE_WRITE is held 1689 + * @file: the file we write to 1690 + * 1691 + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. 1692 + * May be false positive with !S_ISREG, because file_start_write() has 1693 + * no effect on !S_ISREG. 
1694 + */ 1695 + static inline bool file_write_started(const struct file *file) 1696 + { 1697 + if (!S_ISREG(file_inode(file)->i_mode)) 1698 + return true; 1699 + return sb_write_started(file_inode(file)->i_sb); 1700 + } 1701 + 1702 + /** 1703 + * file_write_not_started - check if SB_FREEZE_WRITE is not held 1704 + * @file: the file we write to 1705 + * 1706 + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. 1707 + * May be false positive with !S_ISREG, because file_start_write() has 1708 + * no effect on !S_ISREG. 1709 + */ 1710 + static inline bool file_write_not_started(const struct file *file) 1711 + { 1712 + if (!S_ISREG(file_inode(file)->i_mode)) 1713 + return true; 1714 + return sb_write_not_started(file_inode(file)->i_sb); 1654 1715 } 1655 1716 1656 1717 /** ··· 2093 2032 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); 2094 2033 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, 2095 2034 loff_t, size_t, unsigned int); 2096 - extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, 2097 - struct file *file_out, loff_t pos_out, 2098 - size_t len, unsigned int flags); 2099 2035 int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, 2100 2036 struct file *file_out, loff_t pos_out, 2101 2037 loff_t *len, unsigned int remap_flags, ··· 2593 2535 const struct cred *creds); 2594 2536 struct file *dentry_create(const struct path *path, int flags, umode_t mode, 2595 2537 const struct cred *cred); 2596 - struct file *backing_file_open(const struct path *user_path, int flags, 2597 - const struct path *real_path, 2598 - const struct cred *cred); 2599 2538 struct path *backing_file_user_path(struct file *f); 2600 2539 2601 2540 /* ··· 3072 3017 size_t len, unsigned int flags); 3073 3018 extern ssize_t iter_file_splice_write(struct pipe_inode_info *, 3074 3019 struct file *, loff_t *, size_t, unsigned int); 3075 - extern long do_splice_direct(struct file *in, loff_t 
*ppos, struct file *out, 3076 - loff_t *opos, size_t len, unsigned int flags); 3077 3020 3078 3021 3079 3022 extern void
+35 -15
include/linux/fsnotify.h
··· 100 100 return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); 101 101 } 102 102 103 - /* Simple call site for access decisions */ 104 - static inline int fsnotify_perm(struct file *file, int mask) 103 + /* 104 + * fsnotify_file_area_perm - permission hook before access to file range 105 + */ 106 + static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, 107 + const loff_t *ppos, size_t count) 105 108 { 106 - int ret; 107 - __u32 fsnotify_mask = 0; 109 + __u32 fsnotify_mask = FS_ACCESS_PERM; 108 110 109 - if (!(mask & (MAY_READ | MAY_OPEN))) 111 + /* 112 + * filesystem may be modified in the context of permission events 113 + * (e.g. by HSM filling a file on access), so sb freeze protection 114 + * must not be held. 115 + */ 116 + lockdep_assert_once(file_write_not_started(file)); 117 + 118 + if (!(perm_mask & MAY_READ)) 110 119 return 0; 111 120 112 - if (mask & MAY_OPEN) { 113 - fsnotify_mask = FS_OPEN_PERM; 121 + return fsnotify_file(file, fsnotify_mask); 122 + } 114 123 115 - if (file->f_flags & __FMODE_EXEC) { 116 - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); 124 + /* 125 + * fsnotify_file_perm - permission hook before file access 126 + */ 127 + static inline int fsnotify_file_perm(struct file *file, int perm_mask) 128 + { 129 + return fsnotify_file_area_perm(file, perm_mask, NULL, 0); 130 + } 117 131 118 - if (ret) 119 - return ret; 120 - } 121 - } else if (mask & MAY_READ) { 122 - fsnotify_mask = FS_ACCESS_PERM; 132 + /* 133 + * fsnotify_open_perm - permission hook before file open 134 + */ 135 + static inline int fsnotify_open_perm(struct file *file) 136 + { 137 + int ret; 138 + 139 + if (file->f_flags & __FMODE_EXEC) { 140 + ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); 141 + if (ret) 142 + return ret; 123 143 } 124 144 125 - return fsnotify_file(file, fsnotify_mask); 145 + return fsnotify_file(file, FS_OPEN_PERM); 126 146 } 127 147 128 148 /*
+30 -21
include/linux/splice.h
··· 68 68 typedef int (splice_direct_actor)(struct pipe_inode_info *, 69 69 struct splice_desc *); 70 70 71 - extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, 72 - loff_t *, size_t, unsigned int, 73 - splice_actor *); 74 - extern ssize_t __splice_from_pipe(struct pipe_inode_info *, 75 - struct splice_desc *, splice_actor *); 76 - extern ssize_t splice_to_pipe(struct pipe_inode_info *, 77 - struct splice_pipe_desc *); 78 - extern ssize_t add_to_pipe(struct pipe_inode_info *, 79 - struct pipe_buffer *); 80 - long vfs_splice_read(struct file *in, loff_t *ppos, 81 - struct pipe_inode_info *pipe, size_t len, 82 - unsigned int flags); 83 - extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, 84 - splice_direct_actor *); 85 - extern long do_splice(struct file *in, loff_t *off_in, 86 - struct file *out, loff_t *off_out, 87 - size_t len, unsigned int flags); 71 + ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 72 + loff_t *ppos, size_t len, unsigned int flags, 73 + splice_actor *actor); 74 + ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, 75 + struct splice_desc *sd, splice_actor *actor); 76 + ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 77 + struct splice_pipe_desc *spd); 78 + ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf); 79 + ssize_t vfs_splice_read(struct file *in, loff_t *ppos, 80 + struct pipe_inode_info *pipe, size_t len, 81 + unsigned int flags); 82 + ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd, 83 + splice_direct_actor *actor); 84 + ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, 85 + loff_t *off_out, size_t len, unsigned int flags); 86 + ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 87 + loff_t *opos, size_t len, unsigned int flags); 88 + ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, 89 + loff_t *opos, size_t len); 88 90 89 - extern long 
do_tee(struct file *in, struct file *out, size_t len, 90 - unsigned int flags); 91 - extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, 92 - loff_t *ppos, size_t len, unsigned int flags); 91 + static inline long splice_copy_file_range(struct file *in, loff_t pos_in, 92 + struct file *out, loff_t pos_out, 93 + size_t len) 94 + { 95 + return splice_file_range(in, &pos_in, out, &pos_out, len); 96 + } 97 + 98 + ssize_t do_tee(struct file *in, struct file *out, size_t len, 99 + unsigned int flags); 100 + ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, 101 + loff_t *ppos, size_t len, unsigned int flags); 93 102 94 103 /* 95 104 * for dynamic pipe sizing
+2 -2
io_uring/splice.c
··· 51 51 struct file *out = sp->file_out; 52 52 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 53 53 struct file *in; 54 - long ret = 0; 54 + ssize_t ret = 0; 55 55 56 56 WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); 57 57 ··· 92 92 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 93 93 loff_t *poff_in, *poff_out; 94 94 struct file *in; 95 - long ret = 0; 95 + ssize_t ret = 0; 96 96 97 97 WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); 98 98
+2 -8
security/security.c
··· 2580 2580 */ 2581 2581 int security_file_permission(struct file *file, int mask) 2582 2582 { 2583 - int ret; 2584 - 2585 - ret = call_int_hook(file_permission, 0, file, mask); 2586 - if (ret) 2587 - return ret; 2588 - 2589 - return fsnotify_perm(file, mask); 2583 + return call_int_hook(file_permission, 0, file, mask); 2590 2584 } 2591 2585 2592 2586 /** ··· 2831 2837 if (ret) 2832 2838 return ret; 2833 2839 2834 - return fsnotify_perm(file, MAY_OPEN); 2840 + return fsnotify_open_perm(file); 2835 2841 } 2836 2842 2837 2843 /**