Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'fuse-update-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse

Pull fuse updates from Miklos Szeredi:

- Fix an issue with reusing the bdi in case of block-based filesystems

- Allow root (in init namespace) to access fuse filesystems in user
namespaces if explicitly enabled with a module param

- Misc fixes

* tag 'fuse-update-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse:
fuse: retire block-device-based superblock on force unmount
vfs: function to prevent re-use of block-device-based superblocks
virtio_fs: Modify format for virtio_fs_direct_access
virtiofs: delete unused parameter for virtio_fs_cleanup_vqs
fuse: Add module param for CAP_SYS_ADMIN access bypassing allow_other
fuse: Remove the control interface for virtio-fs
fuse: ioctl: translate ENOSYS
fuse: limit nsec
fuse: avoid unnecessary spinlock bump
fuse: fix deadlock between atomic O_TRUNC and page invalidation
fuse: write inode in fuse_release()

+132 -33
+24 -5
Documentation/filesystems/fuse.rst
··· 279 279 the filesystem or not. 280 280 281 281 Note that the *ptrace* check is not strictly necessary to 282 - prevent B/2/i, it is enough to check if mount owner has enough 282 + prevent C/2/i, it is enough to check if mount owner has enough 283 283 privilege to send signal to the process accessing the 284 284 filesystem, since *SIGSTOP* can be used to get a similar effect. 285 285 ··· 288 288 289 289 If a sysadmin trusts the users enough, or can ensure through other 290 290 measures, that system processes will never enter non-privileged 291 - mounts, it can relax the last limitation with a 'user_allow_other' 292 - config option. If this config option is set, the mounting user can 293 - add the 'allow_other' mount option which disables the check for other 294 - users' processes. 291 + mounts, it can relax the last limitation in several ways: 292 + 293 + - With the 'user_allow_other' config option. If this config option is 294 + set, the mounting user can add the 'allow_other' mount option which 295 + disables the check for other users' processes. 296 + 297 + User namespaces have an unintuitive interaction with 'allow_other': 298 + an unprivileged user - normally restricted from mounting with 299 + 'allow_other' - could do so in a user namespace where they're 300 + privileged. If any process could access such an 'allow_other' mount 301 + this would give the mounting user the ability to manipulate 302 + processes in user namespaces where they're unprivileged. For this 303 + reason 'allow_other' restricts access to users in the same userns 304 + or a descendant. 305 + 306 + - With the 'allow_sys_admin_access' module option. If this option is 307 + set, super user's processes have unrestricted access to mounts 308 + irrespective of allow_other setting or user namespace of the 309 + mounting user. 
310 + 311 + Note that both of these relaxations expose the system to potential 312 + information leak or *DoS* as described in points B and C/2/i-ii in the 313 + preceding section. 295 314 296 315 Kernel - userspace interface 297 316 ============================
+2 -2
fs/fuse/control.c
··· 258 258 struct dentry *parent; 259 259 char name[32]; 260 260 261 - if (!fuse_control_sb) 261 + if (!fuse_control_sb || fc->no_control) 262 262 return 0; 263 263 264 264 parent = fuse_control_sb->s_root; ··· 296 296 { 297 297 int i; 298 298 299 - if (!fuse_control_sb) 299 + if (!fuse_control_sb || fc->no_control) 300 300 return; 301 301 302 302 for (i = fc->ctl_ndents - 1; i >= 0; i--) {
+1 -1
fs/fuse/dax.c
··· 138 138 WARN_ON(fcd->nr_free_ranges <= 0); 139 139 fcd->nr_free_ranges--; 140 140 } 141 + __kick_dmap_free_worker(fcd, 0); 141 142 spin_unlock(&fcd->lock); 142 143 143 - kick_dmap_free_worker(fcd, 0); 144 144 return dmap; 145 145 } 146 146
+15 -1
fs/fuse/dir.c
··· 11 11 #include <linux/pagemap.h> 12 12 #include <linux/file.h> 13 13 #include <linux/fs_context.h> 14 + #include <linux/moduleparam.h> 14 15 #include <linux/sched.h> 15 16 #include <linux/namei.h> 16 17 #include <linux/slab.h> ··· 21 20 #include <linux/security.h> 22 21 #include <linux/types.h> 23 22 #include <linux/kernel.h> 23 + 24 + static bool __read_mostly allow_sys_admin_access; 25 + module_param(allow_sys_admin_access, bool, 0644); 26 + MODULE_PARM_DESC(allow_sys_admin_access, 27 + "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); 24 28 25 29 static void fuse_advise_use_readdirplus(struct inode *dir) 26 30 { ··· 543 537 struct fuse_file *ff; 544 538 void *security_ctx = NULL; 545 539 u32 security_ctxlen; 540 + bool trunc = flags & O_TRUNC; 546 541 547 542 /* Userspace expects S_IFREG in create mode */ 548 543 BUG_ON((mode & S_IFMT) != S_IFREG); ··· 568 561 inarg.mode = mode; 569 562 inarg.umask = current_umask(); 570 563 571 - if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) && 564 + if (fm->fc->handle_killpriv_v2 && trunc && 572 565 !(flags & O_EXCL) && !capable(CAP_FSETID)) { 573 566 inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; 574 567 } ··· 630 623 } else { 631 624 file->private_data = ff; 632 625 fuse_finish_open(inode, file); 626 + if (fm->fc->atomic_o_trunc && trunc) 627 + truncate_pagecache(inode, 0); 628 + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) 629 + invalidate_inode_pages2(inode->i_mapping); 633 630 } 634 631 return err; 635 632 ··· 1234 1223 int fuse_allow_current_process(struct fuse_conn *fc) 1235 1224 { 1236 1225 const struct cred *cred; 1226 + 1227 + if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) 1228 + return 1; 1237 1229 1238 1230 if (fc->allow_other) 1239 1231 return current_in_userns(fc->user_ns);
+26 -13
fs/fuse/file.c
··· 210 210 fi->attr_version = atomic64_inc_return(&fc->attr_version); 211 211 i_size_write(inode, 0); 212 212 spin_unlock(&fi->lock); 213 - truncate_pagecache(inode, 0); 214 213 file_update_time(file); 215 214 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); 216 - } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { 217 - invalidate_inode_pages2(inode->i_mapping); 218 215 } 219 - 220 216 if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) 221 217 fuse_link_write_file(file); 222 218 } ··· 235 239 if (err) 236 240 return err; 237 241 238 - if (is_wb_truncate || dax_truncate) { 242 + if (is_wb_truncate || dax_truncate) 239 243 inode_lock(inode); 240 - fuse_set_nowrite(inode); 241 - } 242 244 243 245 if (dax_truncate) { 244 246 filemap_invalidate_lock(inode->i_mapping); 245 247 err = fuse_dax_break_layouts(inode, 0, 0); 246 248 if (err) 247 - goto out; 249 + goto out_inode_unlock; 248 250 } 251 + 252 + if (is_wb_truncate || dax_truncate) 253 + fuse_set_nowrite(inode); 249 254 250 255 err = fuse_do_open(fm, get_node_id(inode), file, isdir); 251 256 if (!err) 252 257 fuse_finish_open(inode, file); 253 258 254 - out: 259 + if (is_wb_truncate || dax_truncate) 260 + fuse_release_nowrite(inode); 261 + if (!err) { 262 + struct fuse_file *ff = file->private_data; 263 + 264 + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) 265 + truncate_pagecache(inode, 0); 266 + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) 267 + invalidate_inode_pages2(inode->i_mapping); 268 + } 255 269 if (dax_truncate) 256 270 filemap_invalidate_unlock(inode->i_mapping); 257 - 258 - if (is_wb_truncate | dax_truncate) { 259 - fuse_release_nowrite(inode); 271 + out_inode_unlock: 272 + if (is_wb_truncate || dax_truncate) 260 273 inode_unlock(inode); 261 - } 262 274 263 275 return err; 264 276 } ··· 342 338 343 339 static int fuse_release(struct inode *inode, struct file *file) 344 340 { 341 + struct fuse_conn *fc = get_fuse_conn(inode); 342 + 343 + /* 344 + * Dirty pages might remain despite 
write_inode_now() call from 345 + * fuse_flush() due to writes racing with the close. 346 + */ 347 + if (fc->writeback_cache) 348 + write_inode_now(inode, 1); 349 + 345 350 fuse_release_common(file, false); 346 351 347 352 /* return value is ignored by VFS */
+14 -2
fs/fuse/inode.c
··· 180 180 inode->i_uid = make_kuid(fc->user_ns, attr->uid); 181 181 inode->i_gid = make_kgid(fc->user_ns, attr->gid); 182 182 inode->i_blocks = attr->blocks; 183 + 184 + /* Sanitize nsecs */ 185 + attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1); 186 + attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); 187 + attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); 188 + 183 189 inode->i_atime.tv_sec = attr->atime; 184 190 inode->i_atime.tv_nsec = attr->atimensec; 185 191 /* mtime from server may be stale due to local buffered write */ ··· 482 476 { 483 477 struct fuse_conn *fc = get_fuse_conn_super(sb); 484 478 485 - if (!fc->no_force_umount) 486 - fuse_abort_conn(fc); 479 + if (fc->no_force_umount) 480 + return; 481 + 482 + fuse_abort_conn(fc); 483 + 484 + // Only retire block-device-based superblocks. 485 + if (sb->s_bdev != NULL) 486 + retire_super(sb); 487 487 } 488 488 489 489 static void fuse_send_destroy(struct fuse_mount *fm)
+13 -2
fs/fuse/ioctl.c
··· 9 9 #include <linux/compat.h> 10 10 #include <linux/fileattr.h> 11 11 12 + static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args) 13 + { 14 + ssize_t ret = fuse_simple_request(fm, args); 15 + 16 + /* Translate ENOSYS, which shouldn't be returned from fs */ 17 + if (ret == -ENOSYS) 18 + ret = -ENOTTY; 19 + 20 + return ret; 21 + } 22 + 12 23 /* 13 24 * CUSE servers compiled on 32bit broke on 64bit kernels because the 14 25 * ABI was defined to be 'struct iovec' which is different on 32bit ··· 270 259 ap.args.out_pages = true; 271 260 ap.args.out_argvar = true; 272 261 273 - transferred = fuse_simple_request(fm, &ap.args); 262 + transferred = fuse_send_ioctl(fm, &ap.args); 274 263 err = transferred; 275 264 if (transferred < 0) 276 265 goto out; ··· 404 393 args.out_args[1].size = inarg.out_size; 405 394 args.out_args[1].value = ptr; 406 395 407 - err = fuse_simple_request(fm, &args); 396 + err = fuse_send_ioctl(fm, &args); 408 397 if (!err) { 409 398 if (outarg.result < 0) 410 399 err = outarg.result;
+4 -5
fs/fuse/virtio_fs.c
··· 741 741 } 742 742 743 743 /* Free virtqueues (device must already be reset) */ 744 - static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, 745 - struct virtio_fs *fs) 744 + static void virtio_fs_cleanup_vqs(struct virtio_device *vdev) 746 745 { 747 746 vdev->config->del_vqs(vdev); 748 747 } ··· 756 757 { 757 758 struct virtio_fs *fs = dax_get_private(dax_dev); 758 759 phys_addr_t offset = PFN_PHYS(pgoff); 759 - size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; 760 + size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff; 760 761 761 762 if (kaddr) 762 763 *kaddr = fs->window_kaddr + offset; ··· 894 895 895 896 out_vqs: 896 897 virtio_reset_device(vdev); 897 - virtio_fs_cleanup_vqs(vdev, fs); 898 + virtio_fs_cleanup_vqs(vdev); 898 899 kfree(fs->vqs); 899 900 900 901 out: ··· 926 927 virtio_fs_stop_all_queues(fs); 927 928 virtio_fs_drain_all_queues_locked(fs); 928 929 virtio_reset_device(vdev); 929 - virtio_fs_cleanup_vqs(vdev, fs); 930 + virtio_fs_cleanup_vqs(vdev); 930 931 931 932 vdev->priv = NULL; 932 933 /* Put device reference on virtio_fs object */
+31 -2
fs/super.c
··· 423 423 } 424 424 425 425 /** 426 + * retire_super - prevents superblock from being reused 427 + * @sb: superblock to retire 428 + * 429 + * The function marks superblock to be ignored in superblock test, which 430 + * prevents it from being reused for any new mounts. If the superblock has 431 + * a private bdi, it also unregisters it, but doesn't reduce the refcount 432 + * of the superblock to prevent potential races. The refcount is reduced 433 + * by generic_shutdown_super(). The function can not be called 434 + * concurrently with generic_shutdown_super(). It is safe to call the 435 + * function multiple times, subsequent calls have no effect. 436 + * 437 + * The marker will affect the re-use only for block-device-based 438 + * superblocks. Other superblocks will still get marked if this function 439 + * is used, but that will not affect their reusability. 440 + */ 441 + void retire_super(struct super_block *sb) 442 + { 443 + WARN_ON(!sb->s_bdev); 444 + down_write(&sb->s_umount); 445 + if (sb->s_iflags & SB_I_PERSB_BDI) { 446 + bdi_unregister(sb->s_bdi); 447 + sb->s_iflags &= ~SB_I_PERSB_BDI; 448 + } 449 + sb->s_iflags |= SB_I_RETIRED; 450 + up_write(&sb->s_umount); 451 + } 452 + EXPORT_SYMBOL(retire_super); 453 + 454 + /** 426 455 * generic_shutdown_super - common helper for ->kill_sb() 427 456 * @sb: superblock to kill 428 457 * ··· 1245 1216 1246 1217 static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) 1247 1218 { 1248 - return s->s_bdev == fc->sget_key; 1219 + return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key; 1249 1220 } 1250 1221 1251 1222 /** ··· 1338 1309 1339 1310 static int test_bdev_super(struct super_block *s, void *data) 1340 1311 { 1341 - return (void *)s->s_bdev == data; 1312 + return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data; 1342 1313 } 1343 1314 1344 1315 struct dentry *mount_bdev(struct file_system_type *fs_type,
+2
include/linux/fs.h
··· 1433 1433 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ 1434 1434 #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ 1435 1435 #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ 1436 + #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ 1436 1437 1437 1438 /* Possible states of 'frozen' field */ 1438 1439 enum { ··· 2566 2565 int flags, void *data, 2567 2566 int (*fill_super)(struct super_block *, void *, int)); 2568 2567 extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); 2568 + void retire_super(struct super_block *sb); 2569 2569 void generic_shutdown_super(struct super_block *sb); 2570 2570 void kill_block_super(struct super_block *sb); 2571 2571 void kill_anon_super(struct super_block *sb);