Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull second vfs pile from Al Viro:
"The stuff in there: fsfreeze deadlock fixes by Jan (essentially, the
deadlock reproduced by xfstests 068), symlink and hardlink restriction
patches, plus assorted cleanups and fixes.

Note that another fsfreeze deadlock (the emergency thaw one) is *not*
dealt with - the series by Fernando conflicts a lot with Jan's, breaks
the userland ABI (FIFREEZE semantics get changed) and trades the deadlock
for a massive vfsmount leak; this is going to be handled next cycle.
There probably will be another pull request, but that stuff won't be
in it."

Fix up trivial conflicts due to unrelated changes next to each other in
drivers/{staging/gdm72xx/usb_boot.c, usb/gadget/storage_common.c}

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (54 commits)
delousing target_core_file a bit
Documentation: Correct s_umount state for freeze_fs/unfreeze_fs
fs: Remove old freezing mechanism
ext2: Implement freezing
btrfs: Convert to new freezing mechanism
nilfs2: Convert to new freezing mechanism
ntfs: Convert to new freezing mechanism
fuse: Convert to new freezing mechanism
gfs2: Convert to new freezing mechanism
ocfs2: Convert to new freezing mechanism
xfs: Convert to new freezing code
ext4: Convert to new freezing mechanism
fs: Protect write paths by sb_start_write - sb_end_write
fs: Skip atime update on frozen filesystem
fs: Add freezing handling to mnt_want_write() / mnt_drop_write()
fs: Improve filesystem freezing handling
switch the protection of percpu_counter list to spinlock
nfsd: Push mnt_want_write() outside of i_mutex
btrfs: Push mnt_want_write() outside of i_mutex
fat: Push mnt_want_write() outside of i_mutex
...

+1327 -640
+2 -2
Documentation/filesystems/Locking
··· 138 138 put_super: write 139 139 write_super: read 140 140 sync_fs: read 141 - freeze_fs: read 142 - unfreeze_fs: read 141 + freeze_fs: write 142 + unfreeze_fs: write 143 143 statfs: maybe(read) (see below) 144 144 remount_fs: write 145 145 umount_begin: no
+42
Documentation/sysctl/fs.txt
··· 32 32 - nr_open 33 33 - overflowuid 34 34 - overflowgid 35 + - protected_hardlinks 36 + - protected_symlinks 35 37 - suid_dumpable 36 38 - super-max 37 39 - super-nr ··· 156 154 157 155 These sysctls allow you to change the value of the fixed UID and GID. 158 156 The default is 65534. 157 + 158 + ============================================================== 159 + 160 + protected_hardlinks: 161 + 162 + A long-standing class of security issues is the hardlink-based 163 + time-of-check-time-of-use race, most commonly seen in world-writable 164 + directories like /tmp. The common method of exploitation of this flaw 165 + is to cross privilege boundaries when following a given hardlink (i.e. a 166 + root process follows a hardlink created by another user). Additionally, 167 + on systems without separated partitions, this stops unauthorized users 168 + from "pinning" vulnerable setuid/setgid files against being upgraded by 169 + the administrator, or linking to special files. 170 + 171 + When set to "0", hardlink creation behavior is unrestricted. 172 + 173 + When set to "1" hardlinks cannot be created by users if they do not 174 + already own the source file, or do not have read/write access to it. 175 + 176 + This protection is based on the restrictions in Openwall and grsecurity. 177 + 178 + ============================================================== 179 + 180 + protected_symlinks: 181 + 182 + A long-standing class of security issues is the symlink-based 183 + time-of-check-time-of-use race, most commonly seen in world-writable 184 + directories like /tmp. The common method of exploitation of this flaw 185 + is to cross privilege boundaries when following a given symlink (i.e. a 186 + root process follows a symlink belonging to another user). For a likely 187 + incomplete list of hundreds of examples across the years, please see: 188 + http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp 189 + 190 + When set to "0", symlink following behavior is unrestricted. 
191 + 192 + When set to "1" symlinks are permitted to be followed only when outside 193 + a sticky world-writable directory, or when the uid of the symlink and 194 + follower match, or when the directory owner matches the symlink's owner. 195 + 196 + This protection is based on the restrictions in Openwall and grsecurity. 159 197 160 198 ============================================================== 161 199
+25 -52
arch/powerpc/platforms/cell/spufs/inode.c
··· 186 186 static int spufs_rmdir(struct inode *parent, struct dentry *dir) 187 187 { 188 188 /* remove all entries */ 189 + int res; 189 190 spufs_prune_dir(dir); 190 191 d_drop(dir); 191 - 192 - return simple_rmdir(parent, dir); 192 + res = simple_rmdir(parent, dir); 193 + /* We have to give up the mm_struct */ 194 + spu_forget(SPUFS_I(dir->d_inode)->i_ctx); 195 + return res; 193 196 } 194 197 195 198 static int spufs_fill_dir(struct dentry *dir, ··· 247 244 ret = spufs_rmdir(parent, dir); 248 245 mutex_unlock(&parent->i_mutex); 249 246 WARN_ON(ret); 250 - 251 - /* We have to give up the mm_struct */ 252 - spu_forget(ctx); 253 247 254 248 return dcache_dir_close(inode, file); 255 249 } ··· 450 450 struct spu_context *neighbor; 451 451 struct path path = {.mnt = mnt, .dentry = dentry}; 452 452 453 - ret = -EPERM; 454 453 if ((flags & SPU_CREATE_NOSCHED) && 455 454 !capable(CAP_SYS_NICE)) 456 - goto out_unlock; 455 + return -EPERM; 457 456 458 - ret = -EINVAL; 459 457 if ((flags & (SPU_CREATE_NOSCHED | SPU_CREATE_ISOLATE)) 460 458 == SPU_CREATE_ISOLATE) 461 - goto out_unlock; 459 + return -EINVAL; 462 460 463 - ret = -ENODEV; 464 461 if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader) 465 - goto out_unlock; 462 + return -ENODEV; 466 463 467 464 gang = NULL; 468 465 neighbor = NULL; 469 466 affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU); 470 467 if (affinity) { 471 468 gang = SPUFS_I(inode)->i_gang; 472 - ret = -EINVAL; 473 469 if (!gang) 474 - goto out_unlock; 470 + return -EINVAL; 475 471 mutex_lock(&gang->aff_mutex); 476 472 neighbor = spufs_assert_affinity(flags, gang, aff_filp); 477 473 if (IS_ERR(neighbor)) { ··· 488 492 } 489 493 490 494 ret = spufs_context_open(&path); 491 - if (ret < 0) { 495 + if (ret < 0) 492 496 WARN_ON(spufs_rmdir(inode, dentry)); 493 - if (affinity) 494 - mutex_unlock(&gang->aff_mutex); 495 - mutex_unlock(&inode->i_mutex); 496 - spu_forget(SPUFS_I(dentry->d_inode)->i_ctx); 497 - goto out; 498 - } 499 497 500 
498 out_aff_unlock: 501 499 if (affinity) 502 500 mutex_unlock(&gang->aff_mutex); 503 - out_unlock: 504 - mutex_unlock(&inode->i_mutex); 505 - out: 506 - dput(dentry); 507 501 return ret; 508 502 } 509 503 ··· 566 580 int ret; 567 581 568 582 ret = spufs_mkgang(inode, dentry, mode & S_IRWXUGO); 569 - if (ret) 570 - goto out; 571 - 572 - ret = spufs_gang_open(&path); 573 - if (ret < 0) { 574 - int err = simple_rmdir(inode, dentry); 575 - WARN_ON(err); 583 + if (!ret) { 584 + ret = spufs_gang_open(&path); 585 + if (ret < 0) { 586 + int err = simple_rmdir(inode, dentry); 587 + WARN_ON(err); 588 + } 576 589 } 577 - 578 - out: 579 - mutex_unlock(&inode->i_mutex); 580 - dput(dentry); 581 590 return ret; 582 591 } 583 592 ··· 582 601 long spufs_create(struct path *path, struct dentry *dentry, 583 602 unsigned int flags, umode_t mode, struct file *filp) 584 603 { 604 + struct inode *dir = path->dentry->d_inode; 585 605 int ret; 586 606 587 - ret = -EINVAL; 588 607 /* check if we are on spufs */ 589 608 if (path->dentry->d_sb->s_type != &spufs_type) 590 - goto out; 609 + return -EINVAL; 591 610 592 611 /* don't accept undefined flags */ 593 612 if (flags & (~SPU_CREATE_FLAG_ALL)) 594 - goto out; 613 + return -EINVAL; 595 614 596 615 /* only threads can be underneath a gang */ 597 - if (path->dentry != path->dentry->d_sb->s_root) { 598 - if ((flags & SPU_CREATE_GANG) || 599 - !SPUFS_I(path->dentry->d_inode)->i_gang) 600 - goto out; 601 - } 616 + if (path->dentry != path->dentry->d_sb->s_root) 617 + if ((flags & SPU_CREATE_GANG) || !SPUFS_I(dir)->i_gang) 618 + return -EINVAL; 602 619 603 620 mode &= ~current_umask(); 604 621 605 622 if (flags & SPU_CREATE_GANG) 606 - ret = spufs_create_gang(path->dentry->d_inode, 607 - dentry, path->mnt, mode); 623 + ret = spufs_create_gang(dir, dentry, path->mnt, mode); 608 624 else 609 - ret = spufs_create_context(path->dentry->d_inode, 610 - dentry, path->mnt, flags, mode, 625 + ret = spufs_create_context(dir, dentry, path->mnt, flags, 
mode, 611 626 filp); 612 627 if (ret >= 0) 613 - fsnotify_mkdir(path->dentry->d_inode, dentry); 614 - return ret; 628 + fsnotify_mkdir(dir, dentry); 615 629 616 - out: 617 - mutex_unlock(&path->dentry->d_inode->i_mutex); 618 - dput(dentry); 619 630 return ret; 620 631 } 621 632
+1 -1
arch/powerpc/platforms/cell/spufs/syscalls.c
··· 70 70 ret = PTR_ERR(dentry); 71 71 if (!IS_ERR(dentry)) { 72 72 ret = spufs_create(&path, dentry, flags, mode, neighbor); 73 - path_put(&path); 73 + done_path_create(&path, dentry); 74 74 } 75 75 76 76 return ret;
+2 -7
drivers/base/devtmpfs.c
··· 156 156 if (!err) 157 157 /* mark as kernel-created inode */ 158 158 dentry->d_inode->i_private = &thread; 159 - dput(dentry); 160 - mutex_unlock(&path.dentry->d_inode->i_mutex); 161 - path_put(&path); 159 + done_path_create(&path, dentry); 162 160 return err; 163 161 } 164 162 ··· 216 218 /* mark as kernel-created inode */ 217 219 dentry->d_inode->i_private = &thread; 218 220 } 219 - dput(dentry); 220 - 221 - mutex_unlock(&path.dentry->d_inode->i_mutex); 222 - path_put(&path); 221 + done_path_create(&path, dentry); 223 222 return err; 224 223 } 225 224
+1 -1
drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
··· 1188 1188 kfree(buf); 1189 1189 /* close file before return */ 1190 1190 if (fp) 1191 - filp_close(fp, current->files); 1191 + filp_close(fp, NULL); 1192 1192 /* restore previous address limit */ 1193 1193 set_fs(old_fs); 1194 1194
+5 -26
drivers/staging/bcm/Misc.c
··· 157 157 158 158 static struct file *open_firmware_file(struct bcm_mini_adapter *Adapter, const char *path) 159 159 { 160 - struct file *flp = NULL; 161 - mm_segment_t oldfs; 162 - oldfs = get_fs(); 163 - set_fs(get_ds()); 164 - flp = filp_open(path, O_RDONLY, S_IRWXU); 165 - set_fs(oldfs); 160 + struct file *flp = filp_open(path, O_RDONLY, S_IRWXU); 166 161 if (IS_ERR(flp)) { 167 162 pr_err(DRV_NAME "Unable To Open File %s, err %ld", path, PTR_ERR(flp)); 168 163 flp = NULL; ··· 178 183 { 179 184 int errorno = 0; 180 185 struct file *flp = NULL; 181 - mm_segment_t oldfs; 182 186 struct timeval tv = {0}; 183 187 184 188 flp = open_firmware_file(Adapter, path); 185 189 if (!flp) { 186 - errorno = -ENOENT; 187 190 BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Unable to Open %s\n", path); 188 - goto exit_download; 191 + return -ENOENT; 189 192 } 190 193 BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Opened file is = %s and length =0x%lx to be downloaded at =0x%x", path, (unsigned long)flp->f_dentry->d_inode->i_size, loc); 191 194 do_gettimeofday(&tv); ··· 194 201 errorno = -EIO; 195 202 goto exit_download; 196 203 } 197 - oldfs = get_fs(); 198 - set_fs(get_ds()); 199 204 vfs_llseek(flp, 0, 0); 200 - set_fs(oldfs); 201 205 if (Adapter->bcm_file_readback_from_chip(Adapter->pvInterfaceAdapter, flp, loc)) { 202 206 BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Failed to read back firmware!"); 203 207 errorno = -EIO; ··· 202 212 } 203 213 204 214 exit_download: 205 - oldfs = get_fs(); 206 - set_fs(get_ds()); 207 - if (flp && !(IS_ERR(flp))) 208 - filp_close(flp, current->files); 209 - set_fs(oldfs); 210 - 215 + filp_close(flp, NULL); 211 216 return errorno; 212 217 } 213 218 ··· 1041 1056 static int bcm_parse_target_params(struct bcm_mini_adapter *Adapter) 1042 1057 { 1043 1058 struct file *flp = NULL; 1044 - mm_segment_t oldfs = {0}; 1045 1059 char *buff; 1046 1060 int len = 0; 1047 - loff_t pos = 0; 1048 
1061 1049 1062 buff = kmalloc(BUFFER_1K, GFP_KERNEL); 1050 1063 if (!buff) ··· 1062 1079 Adapter->pstargetparams = NULL; 1063 1080 return -ENOENT; 1064 1081 } 1065 - oldfs = get_fs(); 1066 - set_fs(get_ds()); 1067 - len = vfs_read(flp, (void __user __force *)buff, BUFFER_1K, &pos); 1068 - set_fs(oldfs); 1082 + len = kernel_read(flp, 0, buff, BUFFER_1K); 1083 + filp_close(flp, NULL); 1069 1084 1070 1085 if (len != sizeof(STARGETPARAMS)) { 1071 1086 BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Mismatch in Target Param Structure!\n"); 1072 1087 kfree(buff); 1073 1088 kfree(Adapter->pstargetparams); 1074 1089 Adapter->pstargetparams = NULL; 1075 - filp_close(flp, current->files); 1076 1090 return -ENOENT; 1077 1091 } 1078 - filp_close(flp, current->files); 1079 1092 1080 1093 /* Check for autolink in config params */ 1081 1094 /*
+3 -4
drivers/staging/gdm72xx/sdio_boot.c
··· 66 66 return -ENOENT; 67 67 } 68 68 69 - if (filp->f_dentry) 70 - inode = filp->f_dentry->d_inode; 71 - if (!inode || !S_ISREG(inode->i_mode)) { 69 + inode = filp->f_dentry->d_inode; 70 + if (!S_ISREG(inode->i_mode)) { 72 71 printk(KERN_ERR "Invalid file type: %s\n", img_name); 73 72 ret = -EINVAL; 74 73 goto out; ··· 122 123 pno++; 123 124 } 124 125 out: 125 - filp_close(filp, current->files); 126 + filp_close(filp, NULL); 126 127 return ret; 127 128 } 128 129
+9 -13
drivers/staging/gdm72xx/usb_boot.c
··· 173 173 filp = filp_open(img_name, O_RDONLY | O_LARGEFILE, 0); 174 174 if (IS_ERR(filp)) { 175 175 printk(KERN_ERR "Can't find %s.\n", img_name); 176 - set_fs(fs); 177 176 ret = PTR_ERR(filp); 178 177 goto restore_fs; 179 178 } 180 179 181 - if (filp->f_dentry) 182 - inode = filp->f_dentry->d_inode; 183 - if (!inode || !S_ISREG(inode->i_mode)) { 180 + inode = filp->f_dentry->d_inode; 181 + if (!S_ISREG(inode->i_mode)) { 184 182 printk(KERN_ERR "Invalid file type: %s\n", img_name); 185 183 ret = -EINVAL; 186 184 goto out; ··· 260 262 ret = -EINVAL; 261 263 } 262 264 out: 263 - filp_close(filp, current->files); 265 + filp_close(filp, NULL); 264 266 265 267 restore_fs: 266 268 set_fs(fs); ··· 320 322 goto restore_fs; 321 323 } 322 324 323 - if (filp->f_dentry) { 324 - inode = filp->f_dentry->d_inode; 325 - if (!inode || !S_ISREG(inode->i_mode)) { 326 - printk(KERN_ERR "Invalid file type: %s\n", path); 327 - ret = -EINVAL; 328 - goto out; 329 - } 325 + inode = filp->f_dentry->d_inode; 326 + if (!S_ISREG(inode->i_mode)) { 327 + printk(KERN_ERR "Invalid file type: %s\n", path); 328 + ret = -EINVAL; 329 + goto out; 330 330 } 331 331 332 332 buf = kmalloc(DOWNLOAD_CHUCK + pad_size, GFP_KERNEL); ··· 360 364 goto out; 361 365 362 366 out: 363 - filp_close(filp, current->files); 367 + filp_close(filp, NULL); 364 368 365 369 restore_fs: 366 370 set_fs(fs);
+5 -27
drivers/target/target_core_file.c
··· 109 109 struct se_subsystem_dev *se_dev, 110 110 void *p) 111 111 { 112 - char *dev_p = NULL; 113 112 struct se_device *dev; 114 113 struct se_dev_limits dev_limits; 115 114 struct queue_limits *limits; 116 115 struct fd_dev *fd_dev = p; 117 116 struct fd_host *fd_host = hba->hba_ptr; 118 - mm_segment_t old_fs; 119 117 struct file *file; 120 118 struct inode *inode = NULL; 121 119 int dev_flags = 0, flags, ret = -EINVAL; 122 120 123 121 memset(&dev_limits, 0, sizeof(struct se_dev_limits)); 124 122 125 - old_fs = get_fs(); 126 - set_fs(get_ds()); 127 - dev_p = getname(fd_dev->fd_dev_name); 128 - set_fs(old_fs); 129 - 130 - if (IS_ERR(dev_p)) { 131 - pr_err("getname(%s) failed: %lu\n", 132 - fd_dev->fd_dev_name, IS_ERR(dev_p)); 133 - ret = PTR_ERR(dev_p); 134 - goto fail; 135 - } 136 123 /* 137 124 * Use O_DSYNC by default instead of O_SYNC to forgo syncing 138 125 * of pure timestamp updates. 139 126 */ 140 127 flags = O_RDWR | O_CREAT | O_LARGEFILE | O_DSYNC; 141 128 142 - file = filp_open(dev_p, flags, 0600); 129 + file = filp_open(fd_dev->fd_dev_name, flags, 0600); 143 130 if (IS_ERR(file)) { 144 - pr_err("filp_open(%s) failed\n", dev_p); 131 + pr_err("filp_open(%s) failed\n", fd_dev->fd_dev_name); 145 132 ret = PTR_ERR(file); 146 - goto fail; 147 - } 148 - if (!file || !file->f_dentry) { 149 - pr_err("filp_open(%s) failed\n", dev_p); 150 133 goto fail; 151 134 } 152 135 fd_dev->fd_file = file; ··· 195 212 " %llu total bytes\n", fd_host->fd_host_id, fd_dev->fd_dev_id, 196 213 fd_dev->fd_dev_name, fd_dev->fd_dev_size); 197 214 198 - putname(dev_p); 199 215 return dev; 200 216 fail: 201 217 if (fd_dev->fd_file) { 202 218 filp_close(fd_dev->fd_file, NULL); 203 219 fd_dev->fd_file = NULL; 204 220 } 205 - putname(dev_p); 206 221 return ERR_PTR(ret); 207 222 } 208 223 ··· 433 452 token = match_token(ptr, tokens, args); 434 453 switch (token) { 435 454 case Opt_fd_dev_name: 436 - arg_p = match_strdup(&args[0]); 437 - if (!arg_p) { 438 - ret = -ENOMEM; 455 + if 
(match_strlcpy(fd_dev->fd_dev_name, &args[0], 456 + FD_MAX_DEV_NAME) == 0) { 457 + ret = -EINVAL; 439 458 break; 440 459 } 441 - snprintf(fd_dev->fd_dev_name, FD_MAX_DEV_NAME, 442 - "%s", arg_p); 443 - kfree(arg_p); 444 460 pr_debug("FILEIO: Referencing Path: %s\n", 445 461 fd_dev->fd_dev_name); 446 462 fd_dev->fbd_flags |= FBDF_HAS_PATH;
+5 -7
drivers/usb/gadget/storage_common.c
··· 656 656 if (!(filp->f_mode & FMODE_WRITE)) 657 657 ro = 1; 658 658 659 - if (filp->f_path.dentry) 660 - inode = filp->f_path.dentry->d_inode; 661 - if (!inode || (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { 659 + inode = filp->f_path.dentry->d_inode; 660 + if ((!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { 662 661 LINFO(curlun, "invalid file type: %s\n", filename); 663 662 goto out; 664 663 } ··· 666 667 * If we can't read the file, it's no good. 667 668 * If we can't write the file, use it read-only. 668 669 */ 669 - if (!filp->f_op || !(filp->f_op->read || filp->f_op->aio_read)) { 670 + if (!(filp->f_op->read || filp->f_op->aio_read)) { 670 671 LINFO(curlun, "file not readable: %s\n", filename); 671 672 goto out; 672 673 } ··· 711 712 if (fsg_lun_is_open(curlun)) 712 713 fsg_lun_close(curlun); 713 714 714 - get_file(filp); 715 715 curlun->blksize = blksize; 716 716 curlun->blkbits = blkbits; 717 717 curlun->ro = ro; ··· 718 720 curlun->file_length = size; 719 721 curlun->num_sectors = num_sectors; 720 722 LDBG(curlun, "open backing file: %s\n", filename); 721 - rc = 0; 723 + return 0; 722 724 723 725 out: 724 - filp_close(filp, current->files); 726 + fput(filp); 725 727 return rc; 726 728 } 727 729
+3 -3
drivers/usb/gadget/u_uac1.c
··· 275 275 /* Close control device */ 276 276 snd = &gau->control; 277 277 if (snd->filp) 278 - filp_close(snd->filp, current->files); 278 + filp_close(snd->filp, NULL); 279 279 280 280 /* Close PCM playback device and setup substream */ 281 281 snd = &gau->playback; 282 282 if (snd->filp) 283 - filp_close(snd->filp, current->files); 283 + filp_close(snd->filp, NULL); 284 284 285 285 /* Close PCM capture device and setup substream */ 286 286 snd = &gau->capture; 287 287 if (snd->filp) 288 - filp_close(snd->filp, current->files); 288 + filp_close(snd->filp, NULL); 289 289 290 290 return 0; 291 291 }
+2
drivers/video/fb_defio.c
··· 104 104 deferred framebuffer IO. then if userspace touches a page 105 105 again, we repeat the same scheme */ 106 106 107 + file_update_time(vma->vm_file); 108 + 107 109 /* protect against the workqueue changing the page list */ 108 110 mutex_lock(&fbdefio->lock); 109 111
+3
fs/9p/vfs_file.c
··· 610 610 p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", 611 611 page, (unsigned long)filp->private_data); 612 612 613 + /* Update file times before taking page lock */ 614 + file_update_time(filp); 615 + 613 616 v9inode = V9FS_I(inode); 614 617 /* make sure the cache has finished storing the page */ 615 618 v9fs_fscache_wait_on_page_write(inode, page);
-3
fs/btrfs/disk-io.c
··· 1614 1614 struct btrfs_root *root = arg; 1615 1615 1616 1616 do { 1617 - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1618 - 1619 1617 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1620 1618 mutex_trylock(&root->fs_info->cleaner_mutex)) { 1621 1619 btrfs_run_delayed_iputs(root); ··· 1645 1647 do { 1646 1648 cannot_commit = false; 1647 1649 delay = HZ * 30; 1648 - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1649 1650 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1650 1651 1651 1652 spin_lock(&root->fs_info->trans_lock);
+2 -1
fs/btrfs/file.c
··· 1379 1379 ssize_t err = 0; 1380 1380 size_t count, ocount; 1381 1381 1382 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1382 + sb_start_write(inode->i_sb); 1383 1383 1384 1384 mutex_lock(&inode->i_mutex); 1385 1385 ··· 1469 1469 num_written = err; 1470 1470 } 1471 1471 out: 1472 + sb_end_write(inode->i_sb); 1472 1473 current->backing_dev_info = NULL; 1473 1474 return num_written ? num_written : err; 1474 1475 }
+5 -1
fs/btrfs/inode.c
··· 6629 6629 u64 page_start; 6630 6630 u64 page_end; 6631 6631 6632 + sb_start_pagefault(inode->i_sb); 6632 6633 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6633 6634 if (!ret) { 6634 6635 ret = file_update_time(vma->vm_file); ··· 6719 6718 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6720 6719 6721 6720 out_unlock: 6722 - if (!ret) 6721 + if (!ret) { 6722 + sb_end_pagefault(inode->i_sb); 6723 6723 return VM_FAULT_LOCKED; 6724 + } 6724 6725 unlock_page(page); 6725 6726 out: 6726 6727 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 6727 6728 out_noreserve: 6729 + sb_end_pagefault(inode->i_sb); 6728 6730 return ret; 6729 6731 } 6730 6732
+10 -5
fs/btrfs/ioctl.c
··· 195 195 if (!inode_owner_or_capable(inode)) 196 196 return -EACCES; 197 197 198 + ret = mnt_want_write_file(file); 199 + if (ret) 200 + return ret; 201 + 198 202 mutex_lock(&inode->i_mutex); 199 203 200 204 ip_oldflags = ip->flags; ··· 212 208 goto out_unlock; 213 209 } 214 210 } 215 - 216 - ret = mnt_want_write_file(file); 217 - if (ret) 218 - goto out_unlock; 219 211 220 212 if (flags & FS_SYNC_FL) 221 213 ip->flags |= BTRFS_INODE_SYNC; ··· 275 275 inode->i_flags = i_oldflags; 276 276 } 277 277 278 - mnt_drop_write_file(file); 279 278 out_unlock: 280 279 mutex_unlock(&inode->i_mutex); 280 + mnt_drop_write_file(file); 281 281 return ret; 282 282 } 283 283 ··· 664 664 struct dentry *dentry; 665 665 int error; 666 666 667 + error = mnt_want_write(parent->mnt); 668 + if (error) 669 + return error; 670 + 667 671 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 668 672 669 673 dentry = lookup_one_len(name, parent->dentry, namelen); ··· 703 699 dput(dentry); 704 700 out_unlock: 705 701 mutex_unlock(&dir->i_mutex); 702 + mnt_drop_write(parent->mnt); 706 703 return error; 707 704 } 708 705
+7
fs/btrfs/transaction.c
··· 335 335 if (!h) 336 336 return ERR_PTR(-ENOMEM); 337 337 338 + sb_start_intwrite(root->fs_info->sb); 339 + 338 340 if (may_wait_transaction(root, type)) 339 341 wait_current_trans(root); 340 342 ··· 347 345 } while (ret == -EBUSY); 348 346 349 347 if (ret < 0) { 348 + sb_end_intwrite(root->fs_info->sb); 350 349 kmem_cache_free(btrfs_trans_handle_cachep, h); 351 350 return ERR_PTR(ret); 352 351 } ··· 550 547 } 551 548 btrfs_trans_release_metadata(trans, root); 552 549 trans->block_rsv = NULL; 550 + 551 + sb_end_intwrite(root->fs_info->sb); 553 552 554 553 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 555 554 should_end_transaction(trans, root)) { ··· 1582 1577 1583 1578 put_transaction(cur_trans); 1584 1579 put_transaction(cur_trans); 1580 + 1581 + sb_end_intwrite(root->fs_info->sb); 1585 1582 1586 1583 trace_btrfs_transaction_commit(root); 1587 1584
+10 -18
fs/buffer.c
··· 2306 2306 * beyond EOF, then the page is guaranteed safe against truncation until we 2307 2307 * unlock the page. 2308 2308 * 2309 - * Direct callers of this function should call vfs_check_frozen() so that page 2310 - * fault does not busyloop until the fs is thawed. 2309 + * Direct callers of this function should protect against filesystem freezing 2310 + * using sb_start_write() - sb_end_write() functions. 2311 2311 */ 2312 2312 int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 2313 2313 get_block_t get_block) ··· 2317 2317 unsigned long end; 2318 2318 loff_t size; 2319 2319 int ret; 2320 + 2321 + /* 2322 + * Update file times before taking page lock. We may end up failing the 2323 + * fault so this update may be superfluous but who really cares... 2324 + */ 2325 + file_update_time(vma->vm_file); 2320 2326 2321 2327 lock_page(page); 2322 2328 size = i_size_read(inode); ··· 2345 2339 2346 2340 if (unlikely(ret < 0)) 2347 2341 goto out_unlock; 2348 - /* 2349 - * Freezing in progress? We check after the page is marked dirty and 2350 - * with page lock held so if the test here fails, we are sure freezing 2351 - * code will wait during syncing until the page fault is done - at that 2352 - * point page will be dirty and unlocked so freezing code will write it 2353 - * and writeprotect it again. 2354 - */ 2355 2342 set_page_dirty(page); 2356 - if (inode->i_sb->s_frozen != SB_UNFROZEN) { 2357 - ret = -EAGAIN; 2358 - goto out_unlock; 2359 - } 2360 2343 wait_on_page_writeback(page); 2361 2344 return 0; 2362 2345 out_unlock: ··· 2360 2365 int ret; 2361 2366 struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; 2362 2367 2363 - /* 2364 - * This check is racy but catches the common case. The check in 2365 - * __block_page_mkwrite() is reliable. 
2366 - */ 2367 - vfs_check_frozen(sb, SB_FREEZE_WRITE); 2368 + sb_start_pagefault(sb); 2368 2369 ret = __block_page_mkwrite(vma, vmf, get_block); 2370 + sb_end_pagefault(sb); 2369 2371 return block_page_mkwrite_return(ret); 2370 2372 } 2371 2373 EXPORT_SYMBOL(block_page_mkwrite);
+3
fs/ceph/addr.c
··· 1184 1184 loff_t size, len; 1185 1185 int ret; 1186 1186 1187 + /* Update time before taking page lock */ 1188 + file_update_time(vma->vm_file); 1189 + 1187 1190 size = i_size_read(inode); 1188 1191 if (off + PAGE_CACHE_SIZE <= size) 1189 1192 len = PAGE_CACHE_SIZE;
+10 -20
fs/ecryptfs/inode.c
··· 318 318 struct vfsmount *lower_mnt; 319 319 int rc = 0; 320 320 321 - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); 322 - fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); 323 - BUG_ON(!lower_dentry->d_count); 324 - 325 321 dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); 326 - ecryptfs_set_dentry_private(dentry, dentry_info); 327 322 if (!dentry_info) { 328 323 printk(KERN_ERR "%s: Out of memory whilst attempting " 329 324 "to allocate ecryptfs_dentry_info struct\n", 330 325 __func__); 331 326 dput(lower_dentry); 332 - mntput(lower_mnt); 333 - d_drop(dentry); 334 327 return -ENOMEM; 335 328 } 329 + 330 + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); 331 + fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); 332 + BUG_ON(!lower_dentry->d_count); 333 + 334 + ecryptfs_set_dentry_private(dentry, dentry_info); 336 335 ecryptfs_set_dentry_lower(dentry, lower_dentry); 337 336 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); 338 337 ··· 380 381 struct dentry *lower_dir_dentry, *lower_dentry; 381 382 int rc = 0; 382 383 383 - if ((ecryptfs_dentry->d_name.len == 1 384 - && !strcmp(ecryptfs_dentry->d_name.name, ".")) 385 - || (ecryptfs_dentry->d_name.len == 2 386 - && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { 387 - goto out_d_drop; 388 - } 389 384 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 390 385 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 391 386 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, ··· 390 397 rc = PTR_ERR(lower_dentry); 391 398 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 392 399 "[%d] on lower_dentry = [%s]\n", __func__, rc, 393 - encrypted_and_encoded_name); 394 - goto out_d_drop; 400 + ecryptfs_dentry->d_name.name); 401 + goto out; 395 402 } 396 403 if (lower_dentry->d_inode) 397 404 goto interpose; ··· 408 415 if (rc) { 409 416 printk(KERN_ERR "%s: Error attempting to encrypt and 
encode " 410 417 "filename; rc = [%d]\n", __func__, rc); 411 - goto out_d_drop; 418 + goto out; 412 419 } 413 420 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 414 421 lower_dentry = lookup_one_len(encrypted_and_encoded_name, ··· 420 427 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 421 428 "[%d] on lower_dentry = [%s]\n", __func__, rc, 422 429 encrypted_and_encoded_name); 423 - goto out_d_drop; 430 + goto out; 424 431 } 425 432 interpose: 426 433 rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, 427 434 ecryptfs_dir_inode); 428 - goto out; 429 - out_d_drop: 430 - d_drop(ecryptfs_dentry); 431 435 out: 432 436 kfree(encrypted_and_encoded_name); 433 437 return ERR_PTR(rc);
+6 -13
fs/exec.c
··· 2069 2069 */ 2070 2070 static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) 2071 2071 { 2072 - struct file *rp, *wp; 2072 + struct file *files[2]; 2073 2073 struct fdtable *fdt; 2074 2074 struct coredump_params *cp = (struct coredump_params *)info->data; 2075 2075 struct files_struct *cf = current->files; 2076 + int err = create_pipe_files(files, 0); 2077 + if (err) 2078 + return err; 2076 2079 2077 - wp = create_write_pipe(0); 2078 - if (IS_ERR(wp)) 2079 - return PTR_ERR(wp); 2080 - 2081 - rp = create_read_pipe(wp, 0); 2082 - if (IS_ERR(rp)) { 2083 - free_write_pipe(wp); 2084 - return PTR_ERR(rp); 2085 - } 2086 - 2087 - cp->file = wp; 2080 + cp->file = files[1]; 2088 2081 2089 2082 sys_close(0); 2090 - fd_install(0, rp); 2083 + fd_install(0, files[0]); 2091 2084 spin_lock(&cf->file_lock); 2092 2085 fdt = files_fdtable(cf); 2093 2086 __set_open_fd(0, fdt);
+4 -1
fs/ext2/inode.c
··· 79 79 truncate_inode_pages(&inode->i_data, 0); 80 80 81 81 if (want_delete) { 82 + sb_start_intwrite(inode->i_sb); 82 83 /* set dtime */ 83 84 EXT2_I(inode)->i_dtime = get_seconds(); 84 85 mark_inode_dirty(inode); ··· 99 98 if (unlikely(rsv)) 100 99 kfree(rsv); 101 100 102 - if (want_delete) 101 + if (want_delete) { 103 102 ext2_free_inode(inode); 103 + sb_end_intwrite(inode->i_sb); 104 + } 104 105 } 105 106 106 107 typedef struct {
+33
fs/ext2/super.c
··· 42 42 static int ext2_remount (struct super_block * sb, int * flags, char * data); 43 43 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 44 44 static int ext2_sync_fs(struct super_block *sb, int wait); 45 + static int ext2_freeze(struct super_block *sb); 46 + static int ext2_unfreeze(struct super_block *sb); 45 47 46 48 void ext2_error(struct super_block *sb, const char *function, 47 49 const char *fmt, ...) ··· 307 305 .evict_inode = ext2_evict_inode, 308 306 .put_super = ext2_put_super, 309 307 .sync_fs = ext2_sync_fs, 308 + .freeze_fs = ext2_freeze, 309 + .unfreeze_fs = ext2_unfreeze, 310 310 .statfs = ext2_statfs, 311 311 .remount_fs = ext2_remount, 312 312 .show_options = ext2_show_options, ··· 1204 1200 return 0; 1205 1201 } 1206 1202 1203 + static int ext2_freeze(struct super_block *sb) 1204 + { 1205 + struct ext2_sb_info *sbi = EXT2_SB(sb); 1206 + 1207 + /* 1208 + * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared 1209 + * because we have unattached inodes and thus filesystem is not fully 1210 + * consistent. 1211 + */ 1212 + if (atomic_long_read(&sb->s_remove_count)) { 1213 + ext2_sync_fs(sb, 1); 1214 + return 0; 1215 + } 1216 + /* Set EXT2_FS_VALID flag */ 1217 + spin_lock(&sbi->s_lock); 1218 + sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state); 1219 + spin_unlock(&sbi->s_lock); 1220 + ext2_sync_super(sb, sbi->s_es, 1); 1221 + 1222 + return 0; 1223 + } 1224 + 1225 + static int ext2_unfreeze(struct super_block *sb) 1226 + { 1227 + /* Just write sb to clear EXT2_VALID_FS flag */ 1228 + ext2_write_super(sb); 1229 + 1230 + return 0; 1231 + } 1207 1232 1208 1233 void ext2_write_super(struct super_block *sb) 1209 1234 {
+10 -5
fs/ext4/inode.c
··· 233 233 if (is_bad_inode(inode)) 234 234 goto no_delete; 235 235 236 + /* 237 + * Protect us against freezing - iput() caller didn't have to have any 238 + * protection against it 239 + */ 240 + sb_start_intwrite(inode->i_sb); 236 241 handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); 237 242 if (IS_ERR(handle)) { 238 243 ext4_std_error(inode->i_sb, PTR_ERR(handle)); ··· 247 242 * cleaned up. 248 243 */ 249 244 ext4_orphan_del(NULL, inode); 245 + sb_end_intwrite(inode->i_sb); 250 246 goto no_delete; 251 247 } 252 248 ··· 279 273 stop_handle: 280 274 ext4_journal_stop(handle); 281 275 ext4_orphan_del(NULL, inode); 276 + sb_end_intwrite(inode->i_sb); 282 277 goto no_delete; 283 278 } 284 279 } ··· 308 301 else 309 302 ext4_free_inode(handle, inode); 310 303 ext4_journal_stop(handle); 304 + sb_end_intwrite(inode->i_sb); 311 305 return; 312 306 no_delete: 313 307 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ ··· 4787 4779 get_block_t *get_block; 4788 4780 int retries = 0; 4789 4781 4790 - /* 4791 - * This check is racy but catches the common case. We rely on 4792 - * __block_page_mkwrite() to do a reliable check. 4793 - */ 4794 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 4782 + sb_start_pagefault(inode->i_sb); 4795 4783 /* Delalloc case is easy... */ 4796 4784 if (test_opt(inode->i_sb, DELALLOC) && 4797 4785 !ext4_should_journal_data(inode) && ··· 4855 4851 out_ret: 4856 4852 ret = block_page_mkwrite_return(ret); 4857 4853 out: 4854 + sb_end_pagefault(inode->i_sb); 4858 4855 return ret; 4859 4856 }
+6
fs/ext4/mmp.c
··· 44 44 { 45 45 struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); 46 46 47 + /* 48 + * We protect against freezing so that we don't create dirty buffers 49 + * on frozen filesystem. 50 + */ 51 + sb_start_write(sb); 47 52 ext4_mmp_csum_set(sb, mmp); 48 53 mark_buffer_dirty(bh); 49 54 lock_buffer(bh); ··· 56 51 get_bh(bh); 57 52 submit_bh(WRITE_SYNC, bh); 58 53 wait_on_buffer(bh); 54 + sb_end_write(sb); 59 55 if (unlikely(!buffer_uptodate(bh))) 60 56 return 1; 61 57
+7 -24
fs/ext4/super.c
··· 331 331 * journal_end calls result in the superblock being marked dirty, so 332 332 * that sync() will call the filesystem's write_super callback if 333 333 * appropriate. 334 - * 335 - * To avoid j_barrier hold in userspace when a user calls freeze(), 336 - * ext4 prevents a new handle from being started by s_frozen, which 337 - * is in an upper layer. 338 334 */ 339 335 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) 340 336 { 341 337 journal_t *journal; 342 - handle_t *handle; 343 338 344 339 trace_ext4_journal_start(sb, nblocks, _RET_IP_); 345 340 if (sb->s_flags & MS_RDONLY) 346 341 return ERR_PTR(-EROFS); 347 342 343 + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); 348 344 journal = EXT4_SB(sb)->s_journal; 349 - handle = ext4_journal_current_handle(); 350 - 351 - /* 352 - * If a handle has been started, it should be allowed to 353 - * finish, otherwise deadlock could happen between freeze 354 - * and others(e.g. truncate) due to the restart of the 355 - * journal handle if the filesystem is forzen and active 356 - * handles are not stopped. 
357 - */ 358 - if (!handle) 359 - vfs_check_frozen(sb, SB_FREEZE_TRANS); 360 - 361 345 if (!journal) 362 346 return ext4_get_nojournal(); 363 347 /* ··· 2731 2747 sb = elr->lr_super; 2732 2748 ngroups = EXT4_SB(sb)->s_groups_count; 2733 2749 2750 + sb_start_write(sb); 2734 2751 for (group = elr->lr_next_group; group < ngroups; group++) { 2735 2752 gdp = ext4_get_group_desc(sb, group, NULL); 2736 2753 if (!gdp) { ··· 2758 2773 elr->lr_next_sched = jiffies + elr->lr_timeout; 2759 2774 elr->lr_next_group = group + 1; 2760 2775 } 2776 + sb_end_write(sb); 2761 2777 2762 2778 return ret; 2763 2779 } ··· 4446 4460 return 0; 4447 4461 4448 4462 journal = EXT4_SB(sb)->s_journal; 4449 - if (journal) { 4450 - vfs_check_frozen(sb, SB_FREEZE_TRANS); 4463 + if (journal) 4451 4464 ret = ext4_journal_force_commit(journal); 4452 - } 4453 4465 4454 4466 return ret; 4455 4467 } ··· 4477 4493 * gives us a chance to flush the journal completely and mark the fs clean. 4478 4494 * 4479 4495 * Note that only this function cannot bring a filesystem to be in a clean 4480 - * state independently, because ext4 prevents a new handle from being started 4481 - * by @sb->s_frozen, which stays in an upper layer. It thus needs help from 4482 - * the upper layer. 4496 + * state independently. It relies on upper layer to stop all data & metadata 4497 + * modifications. 4483 4498 */ 4484 4499 static int ext4_freeze(struct super_block *sb) 4485 4500 { ··· 4505 4522 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 4506 4523 error = ext4_commit_super(sb, 1); 4507 4524 out: 4508 - /* we rely on s_frozen to stop further updates */ 4525 + /* we rely on upper layer to stop further updates */ 4509 4526 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4510 4527 return error; 4511 4528 }
+7 -8
fs/fat/file.c
··· 43 43 if (err) 44 44 goto out; 45 45 46 - mutex_lock(&inode->i_mutex); 47 46 err = mnt_want_write_file(file); 48 47 if (err) 49 - goto out_unlock_inode; 48 + goto out; 49 + mutex_lock(&inode->i_mutex); 50 50 51 51 /* 52 52 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also ··· 73 73 /* The root directory has no attributes */ 74 74 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { 75 75 err = -EINVAL; 76 - goto out_drop_write; 76 + goto out_unlock_inode; 77 77 } 78 78 79 79 if (sbi->options.sys_immutable && 80 80 ((attr | oldattr) & ATTR_SYS) && 81 81 !capable(CAP_LINUX_IMMUTABLE)) { 82 82 err = -EPERM; 83 - goto out_drop_write; 83 + goto out_unlock_inode; 84 84 } 85 85 86 86 /* ··· 90 90 */ 91 91 err = security_inode_setattr(file->f_path.dentry, &ia); 92 92 if (err) 93 - goto out_drop_write; 93 + goto out_unlock_inode; 94 94 95 95 /* This MUST be done before doing anything irreversible... */ 96 96 err = fat_setattr(file->f_path.dentry, &ia); 97 97 if (err) 98 - goto out_drop_write; 98 + goto out_unlock_inode; 99 99 100 100 fsnotify_change(file->f_path.dentry, ia.ia_valid); 101 101 if (sbi->options.sys_immutable) { ··· 107 107 108 108 fat_save_attrs(inode, attr); 109 109 mark_inode_dirty(inode); 110 - out_drop_write: 111 - mnt_drop_write_file(file); 112 110 out_unlock_inode: 113 111 mutex_unlock(&inode->i_mutex); 112 + mnt_drop_write_file(file); 114 113 out: 115 114 return err; 116 115 }
+2 -2
fs/file_table.c
··· 43 43 44 44 static struct percpu_counter nr_files __cacheline_aligned_in_smp; 45 45 46 - static inline void file_free_rcu(struct rcu_head *head) 46 + static void file_free_rcu(struct rcu_head *head) 47 47 { 48 48 struct file *f = container_of(head, struct file, f_u.fu_rcuhead); 49 49 ··· 217 217 return; 218 218 if (file_check_writeable(file) != 0) 219 219 return; 220 - mnt_drop_write(mnt); 220 + __mnt_drop_write(mnt); 221 221 file_release_write(file); 222 222 } 223 223
+2 -2
fs/fuse/file.c
··· 944 944 return err; 945 945 946 946 count = ocount; 947 - 947 + sb_start_write(inode->i_sb); 948 948 mutex_lock(&inode->i_mutex); 949 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 950 949 951 950 /* We can write back this queue in page reclaim */ 952 951 current->backing_dev_info = mapping->backing_dev_info; ··· 1003 1004 out: 1004 1005 current->backing_dev_info = NULL; 1005 1006 mutex_unlock(&inode->i_mutex); 1007 + sb_end_write(inode->i_sb); 1006 1008 1007 1009 return written ? written : err; 1008 1010 }
+6 -12
fs/gfs2/file.c
··· 373 373 loff_t size; 374 374 int ret; 375 375 376 - /* Wait if fs is frozen. This is racy so we check again later on 377 - * and retry if the fs has been frozen after the page lock has 378 - * been acquired 379 - */ 380 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 376 + sb_start_pagefault(inode->i_sb); 377 + 378 + /* Update file times before taking page lock */ 379 + file_update_time(vma->vm_file); 381 380 382 381 ret = gfs2_rs_alloc(ip); 383 382 if (ret) ··· 461 462 gfs2_holder_uninit(&gh); 462 463 if (ret == 0) { 463 464 set_page_dirty(page); 464 - /* This check must be post dropping of transaction lock */ 465 - if (inode->i_sb->s_frozen == SB_UNFROZEN) { 466 - wait_on_page_writeback(page); 467 - } else { 468 - ret = -EAGAIN; 469 - unlock_page(page); 470 - } 465 + wait_on_page_writeback(page); 471 466 } 467 + sb_end_pagefault(inode->i_sb); 472 468 return block_page_mkwrite_return(ret); 473 469 } 474 470
+4
fs/gfs2/trans.c
··· 50 50 if (revokes) 51 51 tr->tr_reserved += gfs2_struct2blk(sdp, revokes, 52 52 sizeof(u64)); 53 + sb_start_intwrite(sdp->sd_vfs); 53 54 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); 54 55 55 56 error = gfs2_glock_nq(&tr->tr_t_gh); ··· 69 68 gfs2_glock_dq(&tr->tr_t_gh); 70 69 71 70 fail_holder_uninit: 71 + sb_end_intwrite(sdp->sd_vfs); 72 72 gfs2_holder_uninit(&tr->tr_t_gh); 73 73 kfree(tr); 74 74 ··· 118 116 gfs2_holder_uninit(&tr->tr_t_gh); 119 117 kfree(tr); 120 118 } 119 + sb_end_intwrite(sdp->sd_vfs); 121 120 return; 122 121 } 123 122 ··· 139 136 140 137 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) 141 138 gfs2_log_flush(sdp, NULL); 139 + sb_end_intwrite(sdp->sd_vfs); 142 140 } 143 141 144 142 /**
+8 -4
fs/inode.c
··· 1542 1542 if (timespec_equal(&inode->i_atime, &now)) 1543 1543 return; 1544 1544 1545 - if (mnt_want_write(mnt)) 1545 + if (!sb_start_write_trylock(inode->i_sb)) 1546 1546 return; 1547 1547 1548 + if (__mnt_want_write(mnt)) 1549 + goto skip_update; 1548 1550 /* 1549 1551 * File systems can error out when updating inodes if they need to 1550 1552 * allocate new space to modify an inode (such is the case for ··· 1557 1555 * of the fs read only, e.g. subvolumes in Btrfs. 1558 1556 */ 1559 1557 update_time(inode, &now, S_ATIME); 1560 - mnt_drop_write(mnt); 1558 + __mnt_drop_write(mnt); 1559 + skip_update: 1560 + sb_end_write(inode->i_sb); 1561 1561 } 1562 1562 EXPORT_SYMBOL(touch_atime); 1563 1563 ··· 1666 1662 return 0; 1667 1663 1668 1664 /* Finally allowed to write? Takes lock. */ 1669 - if (mnt_want_write_file(file)) 1665 + if (__mnt_want_write_file(file)) 1670 1666 return 0; 1671 1667 1672 1668 ret = update_time(inode, &now, sync_it); 1673 - mnt_drop_write_file(file); 1669 + __mnt_drop_write_file(file); 1674 1670 1675 1671 return ret; 1676 1672 }
+4
fs/internal.h
··· 61 61 62 62 extern struct lglock vfsmount_lock; 63 63 64 + extern int __mnt_want_write(struct vfsmount *); 65 + extern int __mnt_want_write_file(struct file *); 66 + extern void __mnt_drop_write(struct vfsmount *); 67 + extern void __mnt_drop_write_file(struct file *); 64 68 65 69 /* 66 70 * fs_struct.c
+7 -7
fs/lockd/clntproc.c
··· 156 156 struct nlm_rqst *call; 157 157 int status; 158 158 159 - nlm_get_host(host); 160 159 call = nlm_alloc_call(host); 161 160 if (call == NULL) 162 161 return -ENOMEM; 163 162 164 163 nlmclnt_locks_init_private(fl, host); 164 + if (!fl->fl_u.nfs_fl.owner) { 165 + /* lockowner allocation has failed */ 166 + nlmclnt_release_call(call); 167 + return -ENOMEM; 168 + } 165 169 /* Set up the argument struct */ 166 170 nlmclnt_setlockargs(call, fl); 167 171 ··· 189 185 190 186 /* 191 187 * Allocate an NLM RPC call struct 192 - * 193 - * Note: the caller must hold a reference to host. In case of failure, 194 - * this reference will be released. 195 188 */ 196 189 struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) 197 190 { ··· 200 199 atomic_set(&call->a_count, 1); 201 200 locks_init_lock(&call->a_args.lock.fl); 202 201 locks_init_lock(&call->a_res.lock.fl); 203 - call->a_host = host; 202 + call->a_host = nlm_get_host(host); 204 203 return call; 205 204 } 206 205 if (signalled()) ··· 208 207 printk("nlm_alloc_call: failed, waiting for memory\n"); 209 208 schedule_timeout_interruptible(5*HZ); 210 209 } 211 - nlmclnt_release_host(host); 212 210 return NULL; 213 211 } 214 212 ··· 750 750 dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" 751 751 " Attempting to cancel lock.\n"); 752 752 753 - req = nlm_alloc_call(nlm_get_host(host)); 753 + req = nlm_alloc_call(host); 754 754 if (!req) 755 755 return -ENOMEM; 756 756 req->a_flags = RPC_TASK_ASYNC;
+1
fs/lockd/svc4proc.c
··· 257 257 return rpc_system_err; 258 258 259 259 call = nlm_alloc_call(host); 260 + nlmsvc_release_host(host); 260 261 if (call == NULL) 261 262 return rpc_system_err; 262 263
-1
fs/lockd/svclock.c
··· 219 219 struct nlm_block *block; 220 220 struct nlm_rqst *call = NULL; 221 221 222 - nlm_get_host(host); 223 222 call = nlm_alloc_call(host); 224 223 if (call == NULL) 225 224 return NULL;
+1
fs/lockd/svcproc.c
··· 297 297 return rpc_system_err; 298 298 299 299 call = nlm_alloc_call(host); 300 + nlmsvc_release_host(host); 300 301 if (call == NULL) 301 302 return rpc_system_err; 302 303
+213 -100
fs/namei.c
··· 650 650 path_put(link); 651 651 } 652 652 653 + int sysctl_protected_symlinks __read_mostly = 1; 654 + int sysctl_protected_hardlinks __read_mostly = 1; 655 + 656 + /** 657 + * may_follow_link - Check symlink following for unsafe situations 658 + * @link: The path of the symlink 659 + * 660 + * In the case of the sysctl_protected_symlinks sysctl being enabled, 661 + * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is 662 + * in a sticky world-writable directory. This is to protect privileged 663 + * processes from failing races against path names that may change out 664 + * from under them by way of other users creating malicious symlinks. 665 + * It will permit symlinks to be followed only when outside a sticky 666 + * world-writable directory, or when the uid of the symlink and follower 667 + * match, or when the directory owner matches the symlink's owner. 668 + * 669 + * Returns 0 if following the symlink is allowed, -ve on error. 670 + */ 671 + static inline int may_follow_link(struct path *link, struct nameidata *nd) 672 + { 673 + const struct inode *inode; 674 + const struct inode *parent; 675 + 676 + if (!sysctl_protected_symlinks) 677 + return 0; 678 + 679 + /* Allowed if owner and follower match. */ 680 + inode = link->dentry->d_inode; 681 + if (current_cred()->fsuid == inode->i_uid) 682 + return 0; 683 + 684 + /* Allowed if parent directory not sticky and world-writable. */ 685 + parent = nd->path.dentry->d_inode; 686 + if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) 687 + return 0; 688 + 689 + /* Allowed if parent directory and link owner match. 
*/ 690 + if (parent->i_uid == inode->i_uid) 691 + return 0; 692 + 693 + path_put_conditional(link, nd); 694 + path_put(&nd->path); 695 + audit_log_link_denied("follow_link", link); 696 + return -EACCES; 697 + } 698 + 699 + /** 700 + * safe_hardlink_source - Check for safe hardlink conditions 701 + * @inode: the source inode to hardlink from 702 + * 703 + * Return false if at least one of the following conditions: 704 + * - inode is not a regular file 705 + * - inode is setuid 706 + * - inode is setgid and group-exec 707 + * - access failure for read and write 708 + * 709 + * Otherwise returns true. 710 + */ 711 + static bool safe_hardlink_source(struct inode *inode) 712 + { 713 + umode_t mode = inode->i_mode; 714 + 715 + /* Special files should not get pinned to the filesystem. */ 716 + if (!S_ISREG(mode)) 717 + return false; 718 + 719 + /* Setuid files should not get pinned to the filesystem. */ 720 + if (mode & S_ISUID) 721 + return false; 722 + 723 + /* Executable setgid files should not get pinned to the filesystem. */ 724 + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) 725 + return false; 726 + 727 + /* Hardlinking to unreadable or unwritable sources is dangerous. */ 728 + if (inode_permission(inode, MAY_READ | MAY_WRITE)) 729 + return false; 730 + 731 + return true; 732 + } 733 + 734 + /** 735 + * may_linkat - Check permissions for creating a hardlink 736 + * @link: the source to hardlink from 737 + * 738 + * Block hardlink when all of: 739 + * - sysctl_protected_hardlinks enabled 740 + * - fsuid does not match inode 741 + * - hardlink source is unsafe (see safe_hardlink_source() above) 742 + * - not CAP_FOWNER 743 + * 744 + * Returns 0 if successful, -ve on error. 
745 + */ 746 + static int may_linkat(struct path *link) 747 + { 748 + const struct cred *cred; 749 + struct inode *inode; 750 + 751 + if (!sysctl_protected_hardlinks) 752 + return 0; 753 + 754 + cred = current_cred(); 755 + inode = link->dentry->d_inode; 756 + 757 + /* Source inode owner (or CAP_FOWNER) can hardlink all they like, 758 + * otherwise, it must be a safe source. 759 + */ 760 + if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) || 761 + capable(CAP_FOWNER)) 762 + return 0; 763 + 764 + audit_log_link_denied("linkat", link); 765 + return -EPERM; 766 + } 767 + 653 768 static __always_inline int 654 769 follow_link(struct path *link, struct nameidata *nd, void **p) 655 770 { ··· 1933 1818 while (err > 0) { 1934 1819 void *cookie; 1935 1820 struct path link = path; 1821 + err = may_follow_link(&link, nd); 1822 + if (unlikely(err)) 1823 + break; 1936 1824 nd->flags |= LOOKUP_PARENT; 1937 1825 err = follow_link(&link, nd, &cookie); 1938 1826 if (err) ··· 2395 2277 static int atomic_open(struct nameidata *nd, struct dentry *dentry, 2396 2278 struct path *path, struct file *file, 2397 2279 const struct open_flags *op, 2398 - bool *want_write, bool need_lookup, 2280 + bool got_write, bool need_lookup, 2399 2281 int *opened) 2400 2282 { 2401 2283 struct inode *dir = nd->path.dentry->d_inode; ··· 2418 2300 if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) 2419 2301 mode &= ~current_umask(); 2420 2302 2421 - if (open_flag & O_EXCL) { 2303 + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { 2422 2304 open_flag &= ~O_TRUNC; 2423 2305 *opened |= FILE_CREATED; 2424 2306 } ··· 2432 2314 * Another problem is returing the "right" error value (e.g. for an 2433 2315 * O_EXCL open we want to return EEXIST not EROFS). 
2434 2316 */ 2435 - if ((open_flag & (O_CREAT | O_TRUNC)) || 2436 - (open_flag & O_ACCMODE) != O_RDONLY) { 2437 - error = mnt_want_write(nd->path.mnt); 2438 - if (!error) { 2439 - *want_write = true; 2440 - } else if (!(open_flag & O_CREAT)) { 2317 + if (((open_flag & (O_CREAT | O_TRUNC)) || 2318 + (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) { 2319 + if (!(open_flag & O_CREAT)) { 2441 2320 /* 2442 2321 * No O_CREATE -> atomicity not a requirement -> fall 2443 2322 * back to lookup + open ··· 2442 2327 goto no_open; 2443 2328 } else if (open_flag & (O_EXCL | O_TRUNC)) { 2444 2329 /* Fall back and fail with the right error */ 2445 - create_error = error; 2330 + create_error = -EROFS; 2446 2331 goto no_open; 2447 2332 } else { 2448 2333 /* No side effects, safe to clear O_CREAT */ 2449 - create_error = error; 2334 + create_error = -EROFS; 2450 2335 open_flag &= ~O_CREAT; 2451 2336 } 2452 2337 } ··· 2553 2438 static int lookup_open(struct nameidata *nd, struct path *path, 2554 2439 struct file *file, 2555 2440 const struct open_flags *op, 2556 - bool *want_write, int *opened) 2441 + bool got_write, int *opened) 2557 2442 { 2558 2443 struct dentry *dir = nd->path.dentry; 2559 2444 struct inode *dir_inode = dir->d_inode; ··· 2571 2456 goto out_no_open; 2572 2457 2573 2458 if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { 2574 - return atomic_open(nd, dentry, path, file, op, want_write, 2459 + return atomic_open(nd, dentry, path, file, op, got_write, 2575 2460 need_lookup, opened); 2576 2461 } 2577 2462 ··· 2595 2480 * a permanent write count is taken through 2596 2481 * the 'struct file' in finish_open(). 
2597 2482 */ 2598 - error = mnt_want_write(nd->path.mnt); 2599 - if (error) 2483 + if (!got_write) { 2484 + error = -EROFS; 2600 2485 goto out_dput; 2601 - *want_write = true; 2486 + } 2602 2487 *opened |= FILE_CREATED; 2603 2488 error = security_path_mknod(&nd->path, dentry, mode, 0); 2604 2489 if (error) ··· 2628 2513 struct dentry *dir = nd->path.dentry; 2629 2514 int open_flag = op->open_flag; 2630 2515 bool will_truncate = (open_flag & O_TRUNC) != 0; 2631 - bool want_write = false; 2516 + bool got_write = false; 2632 2517 int acc_mode = op->acc_mode; 2633 2518 struct inode *inode; 2634 2519 bool symlink_ok = false; ··· 2697 2582 } 2698 2583 2699 2584 retry_lookup: 2585 + if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { 2586 + error = mnt_want_write(nd->path.mnt); 2587 + if (!error) 2588 + got_write = true; 2589 + /* 2590 + * do _not_ fail yet - we might not need that or fail with 2591 + * a different error; let lookup_open() decide; we'll be 2592 + * dropping this one anyway. 2593 + */ 2594 + } 2700 2595 mutex_lock(&dir->d_inode->i_mutex); 2701 - error = lookup_open(nd, path, file, op, &want_write, opened); 2596 + error = lookup_open(nd, path, file, op, got_write, opened); 2702 2597 mutex_unlock(&dir->d_inode->i_mutex); 2703 2598 2704 2599 if (error <= 0) { ··· 2733 2608 } 2734 2609 2735 2610 /* 2736 - * It already exists. 2611 + * create/update audit record if it already exists. 2737 2612 */ 2738 - audit_inode(pathname, path->dentry); 2613 + if (path->dentry->d_inode) 2614 + audit_inode(pathname, path->dentry); 2739 2615 2740 2616 /* 2741 2617 * If atomic_open() acquired write access it is dropped now due to 2742 2618 * possible mount and symlink following (this might be optimized away if 2743 2619 * necessary...) 
2744 2620 */ 2745 - if (want_write) { 2621 + if (got_write) { 2746 2622 mnt_drop_write(nd->path.mnt); 2747 - want_write = false; 2623 + got_write = false; 2748 2624 } 2749 2625 2750 2626 error = -EEXIST; 2751 - if (open_flag & O_EXCL) 2627 + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) 2752 2628 goto exit_dput; 2753 2629 2754 2630 error = follow_managed(path, nd->flags); ··· 2810 2684 error = mnt_want_write(nd->path.mnt); 2811 2685 if (error) 2812 2686 goto out; 2813 - want_write = true; 2687 + got_write = true; 2814 2688 } 2815 2689 finish_open_created: 2816 2690 error = may_open(&nd->path, acc_mode, open_flag); ··· 2837 2711 goto exit_fput; 2838 2712 } 2839 2713 out: 2840 - if (want_write) 2714 + if (got_write) 2841 2715 mnt_drop_write(nd->path.mnt); 2842 2716 path_put(&save_parent); 2843 2717 terminate_walk(nd); ··· 2861 2735 nd->inode = dir->d_inode; 2862 2736 save_parent.mnt = NULL; 2863 2737 save_parent.dentry = NULL; 2864 - if (want_write) { 2738 + if (got_write) { 2865 2739 mnt_drop_write(nd->path.mnt); 2866 - want_write = false; 2740 + got_write = false; 2867 2741 } 2868 2742 retried = true; 2869 2743 goto retry_lookup; ··· 2903 2777 error = -ELOOP; 2904 2778 break; 2905 2779 } 2780 + error = may_follow_link(&link, nd); 2781 + if (unlikely(error)) 2782 + break; 2906 2783 nd->flags |= LOOKUP_PARENT; 2907 2784 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); 2908 2785 error = follow_link(&link, nd, &cookie); ··· 2975 2846 { 2976 2847 struct dentry *dentry = ERR_PTR(-EEXIST); 2977 2848 struct nameidata nd; 2849 + int err2; 2978 2850 int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 2979 2851 if (error) 2980 2852 return ERR_PTR(error); ··· 2989 2859 nd.flags &= ~LOOKUP_PARENT; 2990 2860 nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; 2991 2861 2862 + /* don't fail immediately if it's r/o, at least try to report other errors */ 2863 + err2 = mnt_want_write(nd.path.mnt); 2992 2864 /* 2993 2865 * Do the final lookup. 
2994 2866 */ 2995 2867 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2996 2868 dentry = lookup_hash(&nd); 2997 2869 if (IS_ERR(dentry)) 2998 - goto fail; 2870 + goto unlock; 2999 2871 2872 + error = -EEXIST; 3000 2873 if (dentry->d_inode) 3001 - goto eexist; 2874 + goto fail; 3002 2875 /* 3003 2876 * Special case - lookup gave negative, but... we had foo/bar/ 3004 2877 * From the vfs_mknod() POV we just have a negative dentry - ··· 3009 2876 * been asking for (non-existent) directory. -ENOENT for you. 3010 2877 */ 3011 2878 if (unlikely(!is_dir && nd.last.name[nd.last.len])) { 3012 - dput(dentry); 3013 - dentry = ERR_PTR(-ENOENT); 2879 + error = -ENOENT; 2880 + goto fail; 2881 + } 2882 + if (unlikely(err2)) { 2883 + error = err2; 3014 2884 goto fail; 3015 2885 } 3016 2886 *path = nd.path; 3017 2887 return dentry; 3018 - eexist: 3019 - dput(dentry); 3020 - dentry = ERR_PTR(-EEXIST); 3021 2888 fail: 2889 + dput(dentry); 2890 + dentry = ERR_PTR(error); 2891 + unlock: 3022 2892 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2893 + if (!err2) 2894 + mnt_drop_write(nd.path.mnt); 3023 2895 out: 3024 2896 path_put(&nd.path); 3025 2897 return dentry; 3026 2898 } 3027 2899 EXPORT_SYMBOL(kern_path_create); 2900 + 2901 + void done_path_create(struct path *path, struct dentry *dentry) 2902 + { 2903 + dput(dentry); 2904 + mutex_unlock(&path->dentry->d_inode->i_mutex); 2905 + mnt_drop_write(path->mnt); 2906 + path_put(path); 2907 + } 2908 + EXPORT_SYMBOL(done_path_create); 3028 2909 3029 2910 struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) 3030 2911 { ··· 3103 2956 struct path path; 3104 2957 int error; 3105 2958 3106 - if (S_ISDIR(mode)) 3107 - return -EPERM; 2959 + error = may_mknod(mode); 2960 + if (error) 2961 + return error; 3108 2962 3109 2963 dentry = user_path_create(dfd, filename, &path, 0); 3110 2964 if (IS_ERR(dentry)) ··· 3113 2965 3114 2966 if (!IS_POSIXACL(path.dentry->d_inode)) 3115 2967 
mode &= ~current_umask(); 3116 - error = may_mknod(mode); 3117 - if (error) 3118 - goto out_dput; 3119 - error = mnt_want_write(path.mnt); 3120 - if (error) 3121 - goto out_dput; 3122 2968 error = security_path_mknod(&path, dentry, mode, dev); 3123 2969 if (error) 3124 - goto out_drop_write; 2970 + goto out; 3125 2971 switch (mode & S_IFMT) { 3126 2972 case 0: case S_IFREG: 3127 2973 error = vfs_create(path.dentry->d_inode,dentry,mode,true); ··· 3128 2986 error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); 3129 2987 break; 3130 2988 } 3131 - out_drop_write: 3132 - mnt_drop_write(path.mnt); 3133 - out_dput: 3134 - dput(dentry); 3135 - mutex_unlock(&path.dentry->d_inode->i_mutex); 3136 - path_put(&path); 3137 - 2989 + out: 2990 + done_path_create(&path, dentry); 3138 2991 return error; 3139 2992 } 3140 2993 ··· 3175 3038 3176 3039 if (!IS_POSIXACL(path.dentry->d_inode)) 3177 3040 mode &= ~current_umask(); 3178 - error = mnt_want_write(path.mnt); 3179 - if (error) 3180 - goto out_dput; 3181 3041 error = security_path_mkdir(&path, dentry, mode); 3182 - if (error) 3183 - goto out_drop_write; 3184 - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); 3185 - out_drop_write: 3186 - mnt_drop_write(path.mnt); 3187 - out_dput: 3188 - dput(dentry); 3189 - mutex_unlock(&path.dentry->d_inode->i_mutex); 3190 - path_put(&path); 3042 + if (!error) 3043 + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); 3044 + done_path_create(&path, dentry); 3191 3045 return error; 3192 3046 } 3193 3047 ··· 3272 3144 } 3273 3145 3274 3146 nd.flags &= ~LOOKUP_PARENT; 3147 + error = mnt_want_write(nd.path.mnt); 3148 + if (error) 3149 + goto exit1; 3275 3150 3276 3151 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 3277 3152 dentry = lookup_hash(&nd); ··· 3285 3154 error = -ENOENT; 3286 3155 goto exit3; 3287 3156 } 3288 - error = mnt_want_write(nd.path.mnt); 3289 - if (error) 3290 - goto exit3; 3291 3157 error = security_path_rmdir(&nd.path, dentry); 3292 3158 if 
(error) 3293 - goto exit4; 3159 + goto exit3; 3294 3160 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 3295 - exit4: 3296 - mnt_drop_write(nd.path.mnt); 3297 3161 exit3: 3298 3162 dput(dentry); 3299 3163 exit2: 3300 3164 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 3165 + mnt_drop_write(nd.path.mnt); 3301 3166 exit1: 3302 3167 path_put(&nd.path); 3303 3168 putname(name); ··· 3360 3233 goto exit1; 3361 3234 3362 3235 nd.flags &= ~LOOKUP_PARENT; 3236 + error = mnt_want_write(nd.path.mnt); 3237 + if (error) 3238 + goto exit1; 3363 3239 3364 3240 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 3365 3241 dentry = lookup_hash(&nd); ··· 3375 3245 if (!inode) 3376 3246 goto slashes; 3377 3247 ihold(inode); 3378 - error = mnt_want_write(nd.path.mnt); 3379 - if (error) 3380 - goto exit2; 3381 3248 error = security_path_unlink(&nd.path, dentry); 3382 3249 if (error) 3383 - goto exit3; 3250 + goto exit2; 3384 3251 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 3385 - exit3: 3386 - mnt_drop_write(nd.path.mnt); 3387 - exit2: 3252 + exit2: 3388 3253 dput(dentry); 3389 3254 } 3390 3255 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 3391 3256 if (inode) 3392 3257 iput(inode); /* truncate the inode here */ 3258 + mnt_drop_write(nd.path.mnt); 3393 3259 exit1: 3394 3260 path_put(&nd.path); 3395 3261 putname(name); ··· 3450 3324 if (IS_ERR(dentry)) 3451 3325 goto out_putname; 3452 3326 3453 - error = mnt_want_write(path.mnt); 3454 - if (error) 3455 - goto out_dput; 3456 3327 error = security_path_symlink(&path, dentry, from); 3457 - if (error) 3458 - goto out_drop_write; 3459 - error = vfs_symlink(path.dentry->d_inode, dentry, from); 3460 - out_drop_write: 3461 - mnt_drop_write(path.mnt); 3462 - out_dput: 3463 - dput(dentry); 3464 - mutex_unlock(&path.dentry->d_inode->i_mutex); 3465 - path_put(&path); 3328 + if (!error) 3329 + error = vfs_symlink(path.dentry->d_inode, dentry, from); 3330 + done_path_create(&path, dentry); 3466 3331 out_putname: 
3467 3332 putname(from); 3468 3333 return error; ··· 3553 3436 error = -EXDEV; 3554 3437 if (old_path.mnt != new_path.mnt) 3555 3438 goto out_dput; 3556 - error = mnt_want_write(new_path.mnt); 3557 - if (error) 3439 + error = may_linkat(&old_path); 3440 + if (unlikely(error)) 3558 3441 goto out_dput; 3559 3442 error = security_path_link(old_path.dentry, &new_path, new_dentry); 3560 3443 if (error) 3561 - goto out_drop_write; 3444 + goto out_dput; 3562 3445 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); 3563 - out_drop_write: 3564 - mnt_drop_write(new_path.mnt); 3565 3446 out_dput: 3566 - dput(new_dentry); 3567 - mutex_unlock(&new_path.dentry->d_inode->i_mutex); 3568 - path_put(&new_path); 3447 + done_path_create(&new_path, new_dentry); 3569 3448 out: 3570 3449 path_put(&old_path); 3571 3450 ··· 3757 3644 if (newnd.last_type != LAST_NORM) 3758 3645 goto exit2; 3759 3646 3647 + error = mnt_want_write(oldnd.path.mnt); 3648 + if (error) 3649 + goto exit2; 3650 + 3760 3651 oldnd.flags &= ~LOOKUP_PARENT; 3761 3652 newnd.flags &= ~LOOKUP_PARENT; 3762 3653 newnd.flags |= LOOKUP_RENAME_TARGET; ··· 3796 3679 if (new_dentry == trap) 3797 3680 goto exit5; 3798 3681 3799 - error = mnt_want_write(oldnd.path.mnt); 3800 - if (error) 3801 - goto exit5; 3802 3682 error = security_path_rename(&oldnd.path, old_dentry, 3803 3683 &newnd.path, new_dentry); 3804 3684 if (error) 3805 - goto exit6; 3685 + goto exit5; 3806 3686 error = vfs_rename(old_dir->d_inode, old_dentry, 3807 3687 new_dir->d_inode, new_dentry); 3808 - exit6: 3809 - mnt_drop_write(oldnd.path.mnt); 3810 3688 exit5: 3811 3689 dput(new_dentry); 3812 3690 exit4: 3813 3691 dput(old_dentry); 3814 3692 exit3: 3815 3693 unlock_rename(new_dir, old_dir); 3694 + mnt_drop_write(oldnd.path.mnt); 3816 3695 exit2: 3817 3696 path_put(&newnd.path); 3818 3697 putname(to);
+78 -21
fs/namespace.c
··· 283 283 } 284 284 285 285 /* 286 - * Most r/o checks on a fs are for operations that take 287 - * discrete amounts of time, like a write() or unlink(). 288 - * We must keep track of when those operations start 289 - * (for permission checks) and when they end, so that 290 - * we can determine when writes are able to occur to 291 - * a filesystem. 286 + * Most r/o & frozen checks on a fs are for operations that take discrete 287 + * amounts of time, like a write() or unlink(). We must keep track of when 288 + * those operations start (for permission checks) and when they end, so that we 289 + * can determine when writes are able to occur to a filesystem. 292 290 */ 293 291 /** 294 - * mnt_want_write - get write access to a mount 292 + * __mnt_want_write - get write access to a mount without freeze protection 295 293 * @m: the mount on which to take a write 296 294 * 297 - * This tells the low-level filesystem that a write is 298 - * about to be performed to it, and makes sure that 299 - * writes are allowed before returning success. When 300 - * the write operation is finished, mnt_drop_write() 301 - * must be called. This is effectively a refcount. 295 + * This tells the low-level filesystem that a write is about to be performed to 296 + * it, and makes sure that writes are allowed (mnt it read-write) before 297 + * returning success. This operation does not protect against filesystem being 298 + * frozen. When the write operation is finished, __mnt_drop_write() must be 299 + * called. This is effectively a refcount. 
302 300 */ 303 - int mnt_want_write(struct vfsmount *m) 301 + int __mnt_want_write(struct vfsmount *m) 304 302 { 305 303 struct mount *mnt = real_mount(m); 306 304 int ret = 0; ··· 324 326 ret = -EROFS; 325 327 } 326 328 preempt_enable(); 329 + 330 + return ret; 331 + } 332 + 333 + /** 334 + * mnt_want_write - get write access to a mount 335 + * @m: the mount on which to take a write 336 + * 337 + * This tells the low-level filesystem that a write is about to be performed to 338 + * it, and makes sure that writes are allowed (mount is read-write, filesystem 339 + * is not frozen) before returning success. When the write operation is 340 + * finished, mnt_drop_write() must be called. This is effectively a refcount. 341 + */ 342 + int mnt_want_write(struct vfsmount *m) 343 + { 344 + int ret; 345 + 346 + sb_start_write(m->mnt_sb); 347 + ret = __mnt_want_write(m); 348 + if (ret) 349 + sb_end_write(m->mnt_sb); 327 350 return ret; 328 351 } 329 352 EXPORT_SYMBOL_GPL(mnt_want_write); ··· 374 355 EXPORT_SYMBOL_GPL(mnt_clone_write); 375 356 376 357 /** 358 + * __mnt_want_write_file - get write access to a file's mount 359 + * @file: the file who's mount on which to take a write 360 + * 361 + * This is like __mnt_want_write, but it takes a file and can 362 + * do some optimisations if the file is open for write already 363 + */ 364 + int __mnt_want_write_file(struct file *file) 365 + { 366 + struct inode *inode = file->f_dentry->d_inode; 367 + 368 + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) 369 + return __mnt_want_write(file->f_path.mnt); 370 + else 371 + return mnt_clone_write(file->f_path.mnt); 372 + } 373 + 374 + /** 377 375 * mnt_want_write_file - get write access to a file's mount 378 376 * @file: the file who's mount on which to take a write 379 377 * ··· 399 363 */ 400 364 int mnt_want_write_file(struct file *file) 401 365 { 402 - struct inode *inode = file->f_dentry->d_inode; 403 - if (!(file->f_mode & FMODE_WRITE) || 
special_file(inode->i_mode)) 404 - return mnt_want_write(file->f_path.mnt); 405 - else 406 - return mnt_clone_write(file->f_path.mnt); 366 + int ret; 367 + 368 + sb_start_write(file->f_path.mnt->mnt_sb); 369 + ret = __mnt_want_write_file(file); 370 + if (ret) 371 + sb_end_write(file->f_path.mnt->mnt_sb); 372 + return ret; 407 373 } 408 374 EXPORT_SYMBOL_GPL(mnt_want_write_file); 409 375 410 376 /** 411 - * mnt_drop_write - give up write access to a mount 377 + * __mnt_drop_write - give up write access to a mount 412 378 * @mnt: the mount on which to give up write access 413 379 * 414 380 * Tells the low-level filesystem that we are done 415 381 * performing writes to it. Must be matched with 416 - * mnt_want_write() call above. 382 + * __mnt_want_write() call above. 417 383 */ 418 - void mnt_drop_write(struct vfsmount *mnt) 384 + void __mnt_drop_write(struct vfsmount *mnt) 419 385 { 420 386 preempt_disable(); 421 387 mnt_dec_writers(real_mount(mnt)); 422 388 preempt_enable(); 423 389 } 390 + 391 + /** 392 + * mnt_drop_write - give up write access to a mount 393 + * @mnt: the mount on which to give up write access 394 + * 395 + * Tells the low-level filesystem that we are done performing writes to it and 396 + * also allows filesystem to be frozen again. Must be matched with 397 + * mnt_want_write() call above. 398 + */ 399 + void mnt_drop_write(struct vfsmount *mnt) 400 + { 401 + __mnt_drop_write(mnt); 402 + sb_end_write(mnt->mnt_sb); 403 + } 424 404 EXPORT_SYMBOL_GPL(mnt_drop_write); 405 + 406 + void __mnt_drop_write_file(struct file *file) 407 + { 408 + __mnt_drop_write(file->f_path.mnt); 409 + } 425 410 426 411 void mnt_drop_write_file(struct file *file) 427 412 {
+5 -4
fs/nfsd/nfs4recover.c
··· 154 154 if (status < 0) 155 155 return; 156 156 157 + status = mnt_want_write_file(rec_file); 158 + if (status) 159 + return; 160 + 157 161 dir = rec_file->f_path.dentry; 158 162 /* lock the parent */ 159 163 mutex_lock(&dir->d_inode->i_mutex); ··· 177 173 * as well be forgiving and just succeed silently. 178 174 */ 179 175 goto out_put; 180 - status = mnt_want_write_file(rec_file); 181 - if (status) 182 - goto out_put; 183 176 status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); 184 - mnt_drop_write_file(rec_file); 185 177 out_put: 186 178 dput(dentry); 187 179 out_unlock: ··· 189 189 " (err %d); please check that %s exists" 190 190 " and is writeable", status, 191 191 user_recovery_dirname); 192 + mnt_drop_write_file(rec_file); 192 193 nfs4_reset_creds(original_cred); 193 194 } 194 195
+1
fs/nfsd/nfsfh.c
··· 635 635 fhp->fh_post_saved = 0; 636 636 #endif 637 637 } 638 + fh_drop_write(fhp); 638 639 if (exp) { 639 640 exp_put(exp); 640 641 fhp->fh_export = NULL;
+8 -1
fs/nfsd/nfsproc.c
··· 196 196 struct dentry *dchild; 197 197 int type, mode; 198 198 __be32 nfserr; 199 + int hosterr; 199 200 dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); 200 201 201 202 dprintk("nfsd: CREATE %s %.*s\n", ··· 215 214 nfserr = nfserr_exist; 216 215 if (isdotent(argp->name, argp->len)) 217 216 goto done; 217 + hosterr = fh_want_write(dirfhp); 218 + if (hosterr) { 219 + nfserr = nfserrno(hosterr); 220 + goto done; 221 + } 222 + 218 223 fh_lock_nested(dirfhp, I_MUTEX_PARENT); 219 224 dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); 220 225 if (IS_ERR(dchild)) { ··· 337 330 out_unlock: 338 331 /* We don't really need to unlock, as fh_put does it. */ 339 332 fh_unlock(dirfhp); 340 - 333 + fh_drop_write(dirfhp); 341 334 done: 342 335 fh_put(dirfhp); 343 336 return nfsd_return_dirop(nfserr, resp);
+40 -39
fs/nfsd/vfs.c
··· 1284 1284 * If it has, the parent directory should already be locked. 1285 1285 */ 1286 1286 if (!resfhp->fh_dentry) { 1287 + host_err = fh_want_write(fhp); 1288 + if (host_err) 1289 + goto out_nfserr; 1290 + 1287 1291 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ 1288 1292 fh_lock_nested(fhp, I_MUTEX_PARENT); 1289 1293 dchild = lookup_one_len(fname, dentry, flen); ··· 1331 1327 goto out; 1332 1328 } 1333 1329 1334 - host_err = fh_want_write(fhp); 1335 - if (host_err) 1336 - goto out_nfserr; 1337 - 1338 1330 /* 1339 1331 * Get the dir op function pointer. 1340 1332 */ 1341 1333 err = 0; 1334 + host_err = 0; 1342 1335 switch (type) { 1343 1336 case S_IFREG: 1344 1337 host_err = vfs_create(dirp, dchild, iap->ia_mode, true); ··· 1352 1351 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); 1353 1352 break; 1354 1353 } 1355 - if (host_err < 0) { 1356 - fh_drop_write(fhp); 1354 + if (host_err < 0) 1357 1355 goto out_nfserr; 1358 - } 1359 1356 1360 1357 err = nfsd_create_setattr(rqstp, resfhp, iap); 1361 1358 ··· 1365 1366 err2 = nfserrno(commit_metadata(fhp)); 1366 1367 if (err2) 1367 1368 err = err2; 1368 - fh_drop_write(fhp); 1369 1369 /* 1370 1370 * Update the file handle to get the new inode info. 1371 1371 */ ··· 1423 1425 err = nfserr_notdir; 1424 1426 if (!dirp->i_op->lookup) 1425 1427 goto out; 1428 + 1429 + host_err = fh_want_write(fhp); 1430 + if (host_err) 1431 + goto out_nfserr; 1432 + 1426 1433 fh_lock_nested(fhp, I_MUTEX_PARENT); 1427 1434 1428 1435 /* ··· 1460 1457 v_atime = verifier[1]&0x7fffffff; 1461 1458 } 1462 1459 1463 - host_err = fh_want_write(fhp); 1464 - if (host_err) 1465 - goto out_nfserr; 1466 1460 if (dchild->d_inode) { 1467 1461 err = 0; 1468 1462 ··· 1530 1530 if (!err) 1531 1531 err = nfserrno(commit_metadata(fhp)); 1532 1532 1533 - fh_drop_write(fhp); 1534 1533 /* 1535 1534 * Update the filehandle to get the new inode info. 
1536 1535 */ ··· 1540 1541 fh_unlock(fhp); 1541 1542 if (dchild && !IS_ERR(dchild)) 1542 1543 dput(dchild); 1544 + fh_drop_write(fhp); 1543 1545 return err; 1544 1546 1545 1547 out_nfserr: ··· 1621 1621 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); 1622 1622 if (err) 1623 1623 goto out; 1624 + 1625 + host_err = fh_want_write(fhp); 1626 + if (host_err) 1627 + goto out_nfserr; 1628 + 1624 1629 fh_lock(fhp); 1625 1630 dentry = fhp->fh_dentry; 1626 1631 dnew = lookup_one_len(fname, dentry, flen); 1627 1632 host_err = PTR_ERR(dnew); 1628 1633 if (IS_ERR(dnew)) 1629 - goto out_nfserr; 1630 - 1631 - host_err = fh_want_write(fhp); 1632 - if (host_err) 1633 1634 goto out_nfserr; 1634 1635 1635 1636 if (unlikely(path[plen] != 0)) { ··· 1692 1691 if (isdotent(name, len)) 1693 1692 goto out; 1694 1693 1694 + host_err = fh_want_write(tfhp); 1695 + if (host_err) { 1696 + err = nfserrno(host_err); 1697 + goto out; 1698 + } 1699 + 1695 1700 fh_lock_nested(ffhp, I_MUTEX_PARENT); 1696 1701 ddir = ffhp->fh_dentry; 1697 1702 dirp = ddir->d_inode; ··· 1709 1702 1710 1703 dold = tfhp->fh_dentry; 1711 1704 1712 - host_err = fh_want_write(tfhp); 1713 - if (host_err) { 1714 - err = nfserrno(host_err); 1715 - goto out_dput; 1716 - } 1717 1705 err = nfserr_noent; 1718 1706 if (!dold->d_inode) 1719 - goto out_drop_write; 1707 + goto out_dput; 1720 1708 host_err = nfsd_break_lease(dold->d_inode); 1721 1709 if (host_err) { 1722 1710 err = nfserrno(host_err); 1723 - goto out_drop_write; 1711 + goto out_dput; 1724 1712 } 1725 1713 host_err = vfs_link(dold, dirp, dnew); 1726 1714 if (!host_err) { ··· 1728 1726 else 1729 1727 err = nfserrno(host_err); 1730 1728 } 1731 - out_drop_write: 1732 - fh_drop_write(tfhp); 1733 1729 out_dput: 1734 1730 dput(dnew); 1735 1731 out_unlock: 1736 1732 fh_unlock(ffhp); 1733 + fh_drop_write(tfhp); 1737 1734 out: 1738 1735 return err; 1739 1736 ··· 1775 1774 if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) 1776 1775 goto out; 1777 1776 
1777 + host_err = fh_want_write(ffhp); 1778 + if (host_err) { 1779 + err = nfserrno(host_err); 1780 + goto out; 1781 + } 1782 + 1778 1783 /* cannot use fh_lock as we need deadlock protective ordering 1779 1784 * so do it by hand */ 1780 1785 trap = lock_rename(tdentry, fdentry); ··· 1811 1804 host_err = -EXDEV; 1812 1805 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1813 1806 goto out_dput_new; 1814 - host_err = fh_want_write(ffhp); 1815 - if (host_err) 1816 - goto out_dput_new; 1817 1807 1818 1808 host_err = nfsd_break_lease(odentry->d_inode); 1819 1809 if (host_err) 1820 - goto out_drop_write; 1810 + goto out_dput_new; 1821 1811 if (ndentry->d_inode) { 1822 1812 host_err = nfsd_break_lease(ndentry->d_inode); 1823 1813 if (host_err) 1824 - goto out_drop_write; 1814 + goto out_dput_new; 1825 1815 } 1826 1816 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1827 1817 if (!host_err) { ··· 1826 1822 if (!host_err) 1827 1823 host_err = commit_metadata(ffhp); 1828 1824 } 1829 - out_drop_write: 1830 - fh_drop_write(ffhp); 1831 1825 out_dput_new: 1832 1826 dput(ndentry); 1833 1827 out_dput_old: ··· 1841 1839 fill_post_wcc(tfhp); 1842 1840 unlock_rename(tdentry, fdentry); 1843 1841 ffhp->fh_locked = tfhp->fh_locked = 0; 1842 + fh_drop_write(ffhp); 1844 1843 1845 1844 out: 1846 1845 return err; ··· 1867 1864 if (err) 1868 1865 goto out; 1869 1866 1867 + host_err = fh_want_write(fhp); 1868 + if (host_err) 1869 + goto out_nfserr; 1870 + 1870 1871 fh_lock_nested(fhp, I_MUTEX_PARENT); 1871 1872 dentry = fhp->fh_dentry; 1872 1873 dirp = dentry->d_inode; ··· 1889 1882 if (!type) 1890 1883 type = rdentry->d_inode->i_mode & S_IFMT; 1891 1884 1892 - host_err = fh_want_write(fhp); 1893 - if (host_err) 1894 - goto out_put; 1895 - 1896 1885 host_err = nfsd_break_lease(rdentry->d_inode); 1897 1886 if (host_err) 1898 - goto out_drop_write; 1887 + goto out_put; 1899 1888 if (type != S_IFDIR) 1900 1889 host_err = vfs_unlink(dirp, rdentry); 1901 1890 else 1902 1891 
host_err = vfs_rmdir(dirp, rdentry); 1903 1892 if (!host_err) 1904 1893 host_err = commit_metadata(fhp); 1905 - out_drop_write: 1906 - fh_drop_write(fhp); 1907 1894 out_put: 1908 1895 dput(rdentry); 1909 1896
+9 -2
fs/nfsd/vfs.h
··· 110 110 111 111 static inline int fh_want_write(struct svc_fh *fh) 112 112 { 113 - return mnt_want_write(fh->fh_export->ex_path.mnt); 113 + int ret = mnt_want_write(fh->fh_export->ex_path.mnt); 114 + 115 + if (!ret) 116 + fh->fh_want_write = 1; 117 + return ret; 114 118 } 115 119 116 120 static inline void fh_drop_write(struct svc_fh *fh) 117 121 { 118 - mnt_drop_write(fh->fh_export->ex_path.mnt); 122 + if (fh->fh_want_write) { 123 + fh->fh_want_write = 0; 124 + mnt_drop_write(fh->fh_export->ex_path.mnt); 125 + } 119 126 } 120 127 121 128 #endif /* LINUX_NFSD_VFS_H */
+11 -7
fs/nilfs2/file.c
··· 69 69 struct page *page = vmf->page; 70 70 struct inode *inode = vma->vm_file->f_dentry->d_inode; 71 71 struct nilfs_transaction_info ti; 72 - int ret; 72 + int ret = 0; 73 73 74 74 if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) 75 75 return VM_FAULT_SIGBUS; /* -ENOSPC */ 76 76 77 + sb_start_pagefault(inode->i_sb); 77 78 lock_page(page); 78 79 if (page->mapping != inode->i_mapping || 79 80 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { 80 81 unlock_page(page); 81 - return VM_FAULT_NOPAGE; /* make the VM retry the fault */ 82 + ret = -EFAULT; /* make the VM retry the fault */ 83 + goto out; 82 84 } 83 85 84 86 /* ··· 114 112 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); 115 113 /* never returns -ENOMEM, but may return -ENOSPC */ 116 114 if (unlikely(ret)) 117 - return VM_FAULT_SIGBUS; 115 + goto out; 118 116 119 - ret = block_page_mkwrite(vma, vmf, nilfs_get_block); 120 - if (ret != VM_FAULT_LOCKED) { 117 + ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); 118 + if (ret) { 121 119 nilfs_transaction_abort(inode->i_sb); 122 - return ret; 120 + goto out; 123 121 } 124 122 nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); 125 123 nilfs_transaction_commit(inode->i_sb); 126 124 127 125 mapped: 128 126 wait_on_page_writeback(page); 129 - return VM_FAULT_LOCKED; 127 + out: 128 + sb_end_pagefault(inode->i_sb); 129 + return block_page_mkwrite_return(ret); 130 130 } 131 131 132 132 static const struct vm_operations_struct nilfs_file_vm_ops = {
-2
fs/nilfs2/ioctl.c
··· 660 660 goto out_free; 661 661 } 662 662 663 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 664 - 665 663 ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); 666 664 if (ret < 0) 667 665 printk(KERN_ERR "NILFS: GC failed during preparation: "
+4 -1
fs/nilfs2/segment.c
··· 189 189 if (ret > 0) 190 190 return 0; 191 191 192 - vfs_check_frozen(sb, SB_FREEZE_WRITE); 192 + sb_start_intwrite(sb); 193 193 194 194 nilfs = sb->s_fs_info; 195 195 down_read(&nilfs->ns_segctor_sem); ··· 205 205 current->journal_info = ti->ti_save; 206 206 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 207 207 kmem_cache_free(nilfs_transaction_cachep, ti); 208 + sb_end_intwrite(sb); 208 209 return ret; 209 210 } 210 211 ··· 247 246 err = nilfs_construct_segment(sb); 248 247 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 249 248 kmem_cache_free(nilfs_transaction_cachep, ti); 249 + sb_end_intwrite(sb); 250 250 return err; 251 251 } 252 252 ··· 266 264 current->journal_info = ti->ti_save; 267 265 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 268 266 kmem_cache_free(nilfs_transaction_cachep, ti); 267 + sb_end_intwrite(sb); 269 268 } 270 269 271 270 void nilfs_relax_pressure_in_lock(struct super_block *sb)
+2 -1
fs/ntfs/file.c
··· 2084 2084 if (err) 2085 2085 return err; 2086 2086 pos = *ppos; 2087 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2088 2087 /* We can write back this queue in page reclaim. */ 2089 2088 current->backing_dev_info = mapping->backing_dev_info; 2090 2089 written = 0; ··· 2118 2119 2119 2120 BUG_ON(iocb->ki_pos != pos); 2120 2121 2122 + sb_start_write(inode->i_sb); 2121 2123 mutex_lock(&inode->i_mutex); 2122 2124 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); 2123 2125 mutex_unlock(&inode->i_mutex); ··· 2127 2127 if (err < 0) 2128 2128 ret = err; 2129 2129 } 2130 + sb_end_write(inode->i_sb); 2130 2131 return ret; 2131 2132 } 2132 2133
+9 -2
fs/ocfs2/file.c
··· 1971 1971 { 1972 1972 struct inode *inode = file->f_path.dentry->d_inode; 1973 1973 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1974 + int ret; 1974 1975 1975 1976 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1976 1977 !ocfs2_writes_unwritten_extents(osb)) ··· 1986 1985 if (!(file->f_mode & FMODE_WRITE)) 1987 1986 return -EBADF; 1988 1987 1989 - return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1988 + ret = mnt_want_write_file(file); 1989 + if (ret) 1990 + return ret; 1991 + ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1992 + mnt_drop_write_file(file); 1993 + return ret; 1990 1994 } 1991 1995 1992 1996 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, ··· 2267 2261 if (iocb->ki_left == 0) 2268 2262 return 0; 2269 2263 2270 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2264 + sb_start_write(inode->i_sb); 2271 2265 2272 2266 appending = file->f_flags & O_APPEND ? 1 : 0; 2273 2267 direct_io = file->f_flags & O_DIRECT ? 1 : 0; ··· 2442 2436 ocfs2_iocb_clear_sem_locked(iocb); 2443 2437 2444 2438 mutex_unlock(&inode->i_mutex); 2439 + sb_end_write(inode->i_sb); 2445 2440 2446 2441 if (written) 2447 2442 ret = written;
+12 -2
fs/ocfs2/ioctl.c
··· 928 928 if (get_user(new_clusters, (int __user *)arg)) 929 929 return -EFAULT; 930 930 931 - return ocfs2_group_extend(inode, new_clusters); 931 + status = mnt_want_write_file(filp); 932 + if (status) 933 + return status; 934 + status = ocfs2_group_extend(inode, new_clusters); 935 + mnt_drop_write_file(filp); 936 + return status; 932 937 case OCFS2_IOC_GROUP_ADD: 933 938 case OCFS2_IOC_GROUP_ADD64: 934 939 if (!capable(CAP_SYS_RESOURCE)) ··· 942 937 if (copy_from_user(&input, (int __user *) arg, sizeof(input))) 943 938 return -EFAULT; 944 939 945 - return ocfs2_group_add(inode, &input); 940 + status = mnt_want_write_file(filp); 941 + if (status) 942 + return status; 943 + status = ocfs2_group_add(inode, &input); 944 + mnt_drop_write_file(filp); 945 + return status; 946 946 case OCFS2_IOC_REFLINK: 947 947 if (copy_from_user(&args, argp, sizeof(args))) 948 948 return -EFAULT;
+6 -1
fs/ocfs2/journal.c
··· 355 355 if (journal_current_handle()) 356 356 return jbd2_journal_start(journal, max_buffs); 357 357 358 + sb_start_intwrite(osb->sb); 359 + 358 360 down_read(&osb->journal->j_trans_barrier); 359 361 360 362 handle = jbd2_journal_start(journal, max_buffs); 361 363 if (IS_ERR(handle)) { 362 364 up_read(&osb->journal->j_trans_barrier); 365 + sb_end_intwrite(osb->sb); 363 366 364 367 mlog_errno(PTR_ERR(handle)); 365 368 ··· 391 388 if (ret < 0) 392 389 mlog_errno(ret); 393 390 394 - if (!nested) 391 + if (!nested) { 395 392 up_read(&journal->j_trans_barrier); 393 + sb_end_intwrite(osb->sb); 394 + } 396 395 397 396 return ret; 398 397 }
+2
fs/ocfs2/mmap.c
··· 136 136 sigset_t oldset; 137 137 int ret; 138 138 139 + sb_start_pagefault(inode->i_sb); 139 140 ocfs2_block_signals(&oldset); 140 141 141 142 /* ··· 166 165 167 166 out: 168 167 ocfs2_unblock_signals(&oldset); 168 + sb_end_pagefault(inode->i_sb); 169 169 return ret; 170 170 } 171 171
+1 -10
fs/ocfs2/refcounttree.c
··· 4466 4466 goto out_dput; 4467 4467 } 4468 4468 4469 - error = mnt_want_write(new_path.mnt); 4470 - if (error) { 4471 - mlog_errno(error); 4472 - goto out_dput; 4473 - } 4474 - 4475 4469 error = ocfs2_vfs_reflink(old_path.dentry, 4476 4470 new_path.dentry->d_inode, 4477 4471 new_dentry, preserve); 4478 - mnt_drop_write(new_path.mnt); 4479 4472 out_dput: 4480 - dput(new_dentry); 4481 - mutex_unlock(&new_path.dentry->d_inode->i_mutex); 4482 - path_put(&new_path); 4473 + done_path_create(&new_path, new_dentry); 4483 4474 out: 4484 4475 path_put(&old_path); 4485 4476
+9 -6
fs/open.c
··· 164 164 if (IS_APPEND(inode)) 165 165 goto out_putf; 166 166 167 + sb_start_write(inode->i_sb); 167 168 error = locks_verify_truncate(inode, file, length); 168 169 if (!error) 169 170 error = security_path_truncate(&file->f_path); 170 171 if (!error) 171 172 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); 173 + sb_end_write(inode->i_sb); 172 174 out_putf: 173 175 fput(file); 174 176 out: ··· 268 266 if (!file->f_op->fallocate) 269 267 return -EOPNOTSUPP; 270 268 271 - return file->f_op->fallocate(file, mode, offset, len); 269 + sb_start_write(inode->i_sb); 270 + ret = file->f_op->fallocate(file, mode, offset, len); 271 + sb_end_write(inode->i_sb); 272 + return ret; 272 273 } 273 274 274 275 SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) ··· 625 620 /* 626 621 * Balanced in __fput() 627 622 */ 628 - error = mnt_want_write(mnt); 623 + error = __mnt_want_write(mnt); 629 624 if (error) 630 625 put_write_access(inode); 631 626 } ··· 659 654 if (unlikely(f->f_flags & O_PATH)) 660 655 f->f_mode = FMODE_PATH; 661 656 657 + path_get(&f->f_path); 662 658 inode = f->f_path.dentry->d_inode; 663 659 if (f->f_mode & FMODE_WRITE) { 664 660 error = __get_file_write_access(inode, f->f_path.mnt); ··· 745 739 int error; 746 740 BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ 747 741 748 - mntget(file->f_path.mnt); 749 - file->f_path.dentry = dget(dentry); 750 - 742 + file->f_path.dentry = dentry; 751 743 error = do_dentry_open(file, open, current_cred()); 752 744 if (!error) 753 745 *opened |= FILE_OPENED; ··· 788 784 789 785 f->f_flags = flags; 790 786 f->f_path = *path; 791 - path_get(&f->f_path); 792 787 error = do_dentry_open(f, NULL, cred); 793 788 if (!error) { 794 789 error = open_check_o_direct(f);
+26 -49
fs/pipe.c
··· 1016 1016 return NULL; 1017 1017 } 1018 1018 1019 - struct file *create_write_pipe(int flags) 1019 + int create_pipe_files(struct file **res, int flags) 1020 1020 { 1021 1021 int err; 1022 - struct inode *inode; 1022 + struct inode *inode = get_pipe_inode(); 1023 1023 struct file *f; 1024 1024 struct path path; 1025 - struct qstr name = { .name = "" }; 1025 + static struct qstr name = { .name = "" }; 1026 1026 1027 - err = -ENFILE; 1028 - inode = get_pipe_inode(); 1029 1027 if (!inode) 1030 - goto err; 1028 + return -ENFILE; 1031 1029 1032 1030 err = -ENOMEM; 1033 1031 path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); ··· 1039 1041 f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); 1040 1042 if (!f) 1041 1043 goto err_dentry; 1042 - f->f_mapping = inode->i_mapping; 1043 1044 1044 1045 f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); 1045 - f->f_version = 0; 1046 1046 1047 - return f; 1047 + res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); 1048 + if (!res[0]) 1049 + goto err_file; 1048 1050 1049 - err_dentry: 1051 + path_get(&path); 1052 + res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); 1053 + res[1] = f; 1054 + return 0; 1055 + 1056 + err_file: 1057 + put_filp(f); 1058 + err_dentry: 1050 1059 free_pipe_info(inode); 1051 1060 path_put(&path); 1052 - return ERR_PTR(err); 1061 + return err; 1053 1062 1054 - err_inode: 1063 + err_inode: 1055 1064 free_pipe_info(inode); 1056 1065 iput(inode); 1057 - err: 1058 - return ERR_PTR(err); 1059 - } 1060 - 1061 - void free_write_pipe(struct file *f) 1062 - { 1063 - free_pipe_info(f->f_dentry->d_inode); 1064 - path_put(&f->f_path); 1065 - put_filp(f); 1066 - } 1067 - 1068 - struct file *create_read_pipe(struct file *wrf, int flags) 1069 - { 1070 - /* Grab pipe from the writer */ 1071 - struct file *f = alloc_file(&wrf->f_path, FMODE_READ, 1072 - &read_pipefifo_fops); 1073 - if (!f) 1074 - return ERR_PTR(-ENFILE); 1075 - 1076 - path_get(&wrf->f_path); 1077 - f->f_flags = O_RDONLY | (flags & 
O_NONBLOCK); 1078 - 1079 - return f; 1066 + return err; 1080 1067 } 1081 1068 1082 1069 int do_pipe_flags(int *fd, int flags) 1083 1070 { 1084 - struct file *fw, *fr; 1071 + struct file *files[2]; 1085 1072 int error; 1086 1073 int fdw, fdr; 1087 1074 1088 1075 if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) 1089 1076 return -EINVAL; 1090 1077 1091 - fw = create_write_pipe(flags); 1092 - if (IS_ERR(fw)) 1093 - return PTR_ERR(fw); 1094 - fr = create_read_pipe(fw, flags); 1095 - error = PTR_ERR(fr); 1096 - if (IS_ERR(fr)) 1097 - goto err_write_pipe; 1078 + error = create_pipe_files(files, flags); 1079 + if (error) 1080 + return error; 1098 1081 1099 1082 error = get_unused_fd_flags(flags); 1100 1083 if (error < 0) ··· 1088 1109 fdw = error; 1089 1110 1090 1111 audit_fd_pair(fdr, fdw); 1091 - fd_install(fdr, fr); 1092 - fd_install(fdw, fw); 1112 + fd_install(fdr, files[0]); 1113 + fd_install(fdw, files[1]); 1093 1114 fd[0] = fdr; 1094 1115 fd[1] = fdw; 1095 1116 ··· 1098 1119 err_fdr: 1099 1120 put_unused_fd(fdr); 1100 1121 err_read_pipe: 1101 - path_put(&fr->f_path); 1102 - put_filp(fr); 1103 - err_write_pipe: 1104 - free_write_pipe(fw); 1122 + fput(files[0]); 1123 + fput(files[1]); 1105 1124 return error; 1106 1125 } 1107 1126
+3
fs/splice.c
··· 996 996 }; 997 997 ssize_t ret; 998 998 999 + sb_start_write(inode->i_sb); 1000 + 999 1001 pipe_lock(pipe); 1000 1002 1001 1003 splice_from_pipe_begin(&sd); ··· 1036 1034 *ppos += ret; 1037 1035 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 1038 1036 } 1037 + sb_end_write(inode->i_sb); 1039 1038 1040 1039 return ret; 1041 1040 }
+230 -22
fs/super.c
··· 33 33 #include <linux/rculist_bl.h> 34 34 #include <linux/cleancache.h> 35 35 #include <linux/fsnotify.h> 36 + #include <linux/lockdep.h> 36 37 #include "internal.h" 37 38 38 39 39 40 LIST_HEAD(super_blocks); 40 41 DEFINE_SPINLOCK(sb_lock); 42 + 43 + static char *sb_writers_name[SB_FREEZE_LEVELS] = { 44 + "sb_writers", 45 + "sb_pagefaults", 46 + "sb_internal", 47 + }; 41 48 42 49 /* 43 50 * One thing we have to be careful of with a per-sb shrinker is that we don't ··· 109 102 return total_objects; 110 103 } 111 104 105 + static int init_sb_writers(struct super_block *s, struct file_system_type *type) 106 + { 107 + int err; 108 + int i; 109 + 110 + for (i = 0; i < SB_FREEZE_LEVELS; i++) { 111 + err = percpu_counter_init(&s->s_writers.counter[i], 0); 112 + if (err < 0) 113 + goto err_out; 114 + lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], 115 + &type->s_writers_key[i], 0); 116 + } 117 + init_waitqueue_head(&s->s_writers.wait); 118 + init_waitqueue_head(&s->s_writers.wait_unfrozen); 119 + return 0; 120 + err_out: 121 + while (--i >= 0) 122 + percpu_counter_destroy(&s->s_writers.counter[i]); 123 + return err; 124 + } 125 + 126 + static void destroy_sb_writers(struct super_block *s) 127 + { 128 + int i; 129 + 130 + for (i = 0; i < SB_FREEZE_LEVELS; i++) 131 + percpu_counter_destroy(&s->s_writers.counter[i]); 132 + } 133 + 112 134 /** 113 135 * alloc_super - create new superblock 114 136 * @type: filesystem type superblock should belong to ··· 153 117 154 118 if (s) { 155 119 if (security_sb_alloc(s)) { 120 + /* 121 + * We cannot call security_sb_free() without 122 + * security_sb_alloc() succeeding. 
So bail out manually 123 + */ 156 124 kfree(s); 157 125 s = NULL; 158 126 goto out; 159 127 } 160 128 #ifdef CONFIG_SMP 161 129 s->s_files = alloc_percpu(struct list_head); 162 - if (!s->s_files) { 163 - security_sb_free(s); 164 - kfree(s); 165 - s = NULL; 166 - goto out; 167 - } else { 130 + if (!s->s_files) 131 + goto err_out; 132 + else { 168 133 int i; 169 134 170 135 for_each_possible_cpu(i) ··· 174 137 #else 175 138 INIT_LIST_HEAD(&s->s_files); 176 139 #endif 140 + if (init_sb_writers(s, type)) 141 + goto err_out; 177 142 s->s_flags = flags; 178 143 s->s_bdi = &default_backing_dev_info; 179 144 INIT_HLIST_NODE(&s->s_instances); ··· 217 178 mutex_init(&s->s_dquot.dqio_mutex); 218 179 mutex_init(&s->s_dquot.dqonoff_mutex); 219 180 init_rwsem(&s->s_dquot.dqptr_sem); 220 - init_waitqueue_head(&s->s_wait_unfrozen); 221 181 s->s_maxbytes = MAX_NON_LFS; 222 182 s->s_op = &default_op; 223 183 s->s_time_gran = 1000000000; ··· 228 190 } 229 191 out: 230 192 return s; 193 + err_out: 194 + security_sb_free(s); 195 + #ifdef CONFIG_SMP 196 + if (s->s_files) 197 + free_percpu(s->s_files); 198 + #endif 199 + destroy_sb_writers(s); 200 + kfree(s); 201 + s = NULL; 202 + goto out; 231 203 } 232 204 233 205 /** ··· 251 203 #ifdef CONFIG_SMP 252 204 free_percpu(s->s_files); 253 205 #endif 206 + destroy_sb_writers(s); 254 207 security_sb_free(s); 255 208 WARN_ON(!list_empty(&s->s_mounts)); 256 209 kfree(s->s_subtype); ··· 700 651 { 701 652 while (1) { 702 653 struct super_block *s = get_super(bdev); 703 - if (!s || s->s_frozen == SB_UNFROZEN) 654 + if (!s || s->s_writers.frozen == SB_UNFROZEN) 704 655 return s; 705 656 up_read(&s->s_umount); 706 - vfs_check_frozen(s, SB_FREEZE_WRITE); 657 + wait_event(s->s_writers.wait_unfrozen, 658 + s->s_writers.frozen == SB_UNFROZEN); 707 659 put_super(s); 708 660 } 709 661 } ··· 782 732 int retval; 783 733 int remount_ro; 784 734 785 - if (sb->s_frozen != SB_UNFROZEN) 735 + if (sb->s_writers.frozen != SB_UNFROZEN) 786 736 return -EBUSY; 787 
737 788 738 #ifdef CONFIG_BLOCK ··· 1213 1163 return ERR_PTR(error); 1214 1164 } 1215 1165 1166 + /* 1167 + * This is an internal function, please use sb_end_{write,pagefault,intwrite} 1168 + * instead. 1169 + */ 1170 + void __sb_end_write(struct super_block *sb, int level) 1171 + { 1172 + percpu_counter_dec(&sb->s_writers.counter[level-1]); 1173 + /* 1174 + * Make sure s_writers are updated before we wake up waiters in 1175 + * freeze_super(). 1176 + */ 1177 + smp_mb(); 1178 + if (waitqueue_active(&sb->s_writers.wait)) 1179 + wake_up(&sb->s_writers.wait); 1180 + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); 1181 + } 1182 + EXPORT_SYMBOL(__sb_end_write); 1183 + 1184 + #ifdef CONFIG_LOCKDEP 1185 + /* 1186 + * We want lockdep to tell us about possible deadlocks with freezing but 1187 + * it's it bit tricky to properly instrument it. Getting a freeze protection 1188 + * works as getting a read lock but there are subtle problems. XFS for example 1189 + * gets freeze protection on internal level twice in some cases, which is OK 1190 + * only because we already hold a freeze protection also on higher level. Due 1191 + * to these cases we have to tell lockdep we are doing trylock when we 1192 + * already hold a freeze protection for a higher freeze level. 1193 + */ 1194 + static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, 1195 + unsigned long ip) 1196 + { 1197 + int i; 1198 + 1199 + if (!trylock) { 1200 + for (i = 0; i < level - 1; i++) 1201 + if (lock_is_held(&sb->s_writers.lock_map[i])) { 1202 + trylock = true; 1203 + break; 1204 + } 1205 + } 1206 + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); 1207 + } 1208 + #endif 1209 + 1210 + /* 1211 + * This is an internal function, please use sb_start_{write,pagefault,intwrite} 1212 + * instead. 
1213 + */ 1214 + int __sb_start_write(struct super_block *sb, int level, bool wait) 1215 + { 1216 + retry: 1217 + if (unlikely(sb->s_writers.frozen >= level)) { 1218 + if (!wait) 1219 + return 0; 1220 + wait_event(sb->s_writers.wait_unfrozen, 1221 + sb->s_writers.frozen < level); 1222 + } 1223 + 1224 + #ifdef CONFIG_LOCKDEP 1225 + acquire_freeze_lock(sb, level, !wait, _RET_IP_); 1226 + #endif 1227 + percpu_counter_inc(&sb->s_writers.counter[level-1]); 1228 + /* 1229 + * Make sure counter is updated before we check for frozen. 1230 + * freeze_super() first sets frozen and then checks the counter. 1231 + */ 1232 + smp_mb(); 1233 + if (unlikely(sb->s_writers.frozen >= level)) { 1234 + __sb_end_write(sb, level); 1235 + goto retry; 1236 + } 1237 + return 1; 1238 + } 1239 + EXPORT_SYMBOL(__sb_start_write); 1240 + 1241 + /** 1242 + * sb_wait_write - wait until all writers to given file system finish 1243 + * @sb: the super for which we wait 1244 + * @level: type of writers we wait for (normal vs page fault) 1245 + * 1246 + * This function waits until there are no writers of given type to given file 1247 + * system. Caller of this function should make sure there can be no new writers 1248 + * of type @level before calling this function. Otherwise this function can 1249 + * livelock. 
1250 + */ 1251 + static void sb_wait_write(struct super_block *sb, int level) 1252 + { 1253 + s64 writers; 1254 + 1255 + /* 1256 + * We just cycle-through lockdep here so that it does not complain 1257 + * about returning with lock to userspace 1258 + */ 1259 + rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); 1260 + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); 1261 + 1262 + do { 1263 + DEFINE_WAIT(wait); 1264 + 1265 + /* 1266 + * We use a barrier in prepare_to_wait() to separate setting 1267 + * of frozen and checking of the counter 1268 + */ 1269 + prepare_to_wait(&sb->s_writers.wait, &wait, 1270 + TASK_UNINTERRUPTIBLE); 1271 + 1272 + writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); 1273 + if (writers) 1274 + schedule(); 1275 + 1276 + finish_wait(&sb->s_writers.wait, &wait); 1277 + } while (writers); 1278 + } 1279 + 1216 1280 /** 1217 1281 * freeze_super - lock the filesystem and force it into a consistent state 1218 1282 * @sb: the super to lock ··· 1334 1170 * Syncs the super to make sure the filesystem is consistent and calls the fs's 1335 1171 * freeze_fs. Subsequent calls to this without first thawing the fs will return 1336 1172 * -EBUSY. 1173 + * 1174 + * During this function, sb->s_writers.frozen goes through these values: 1175 + * 1176 + * SB_UNFROZEN: File system is normal, all writes progress as usual. 1177 + * 1178 + * SB_FREEZE_WRITE: The file system is in the process of being frozen. New 1179 + * writes should be blocked, though page faults are still allowed. We wait for 1180 + * all writes to complete and then proceed to the next stage. 1181 + * 1182 + * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked 1183 + * but internal fs threads can still modify the filesystem (although they 1184 + * should not dirty new pages or inodes), writeback can run etc. 
After waiting 1185 + * for all running page faults we sync the filesystem which will clean all 1186 + * dirty pages and inodes (no new dirty pages or inodes can be created when 1187 + * sync is running). 1188 + * 1189 + * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs 1190 + * modification are blocked (e.g. XFS preallocation truncation on inode 1191 + * reclaim). This is usually implemented by blocking new transactions for 1192 + * filesystems that have them and need this additional guard. After all 1193 + * internal writers are finished we call ->freeze_fs() to finish filesystem 1194 + * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is 1195 + * mostly auxiliary for filesystems to verify they do not modify frozen fs. 1196 + * 1197 + * sb->s_writers.frozen is protected by sb->s_umount. 1337 1198 */ 1338 1199 int freeze_super(struct super_block *sb) 1339 1200 { ··· 1366 1177 1367 1178 atomic_inc(&sb->s_active); 1368 1179 down_write(&sb->s_umount); 1369 - if (sb->s_frozen) { 1180 + if (sb->s_writers.frozen != SB_UNFROZEN) { 1370 1181 deactivate_locked_super(sb); 1371 1182 return -EBUSY; 1372 1183 } ··· 1377 1188 } 1378 1189 1379 1190 if (sb->s_flags & MS_RDONLY) { 1380 - sb->s_frozen = SB_FREEZE_TRANS; 1381 - smp_wmb(); 1191 + /* Nothing to do really... */ 1192 + sb->s_writers.frozen = SB_FREEZE_COMPLETE; 1382 1193 up_write(&sb->s_umount); 1383 1194 return 0; 1384 1195 } 1385 1196 1386 - sb->s_frozen = SB_FREEZE_WRITE; 1197 + /* From now on, no new normal writers can start */ 1198 + sb->s_writers.frozen = SB_FREEZE_WRITE; 1387 1199 smp_wmb(); 1388 1200 1201 + /* Release s_umount to preserve sb_start_write -> s_umount ordering */ 1202 + up_write(&sb->s_umount); 1203 + 1204 + sb_wait_write(sb, SB_FREEZE_WRITE); 1205 + 1206 + /* Now we go and block page faults... 
*/ 1207 + down_write(&sb->s_umount); 1208 + sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; 1209 + smp_wmb(); 1210 + 1211 + sb_wait_write(sb, SB_FREEZE_PAGEFAULT); 1212 + 1213 + /* All writers are done so after syncing there won't be dirty data */ 1389 1214 sync_filesystem(sb); 1390 1215 1391 - sb->s_frozen = SB_FREEZE_TRANS; 1216 + /* Now wait for internal filesystem counter */ 1217 + sb->s_writers.frozen = SB_FREEZE_FS; 1392 1218 smp_wmb(); 1219 + sb_wait_write(sb, SB_FREEZE_FS); 1393 1220 1394 - sync_blockdev(sb->s_bdev); 1395 1221 if (sb->s_op->freeze_fs) { 1396 1222 ret = sb->s_op->freeze_fs(sb); 1397 1223 if (ret) { 1398 1224 printk(KERN_ERR 1399 1225 "VFS:Filesystem freeze failed\n"); 1400 - sb->s_frozen = SB_UNFROZEN; 1226 + sb->s_writers.frozen = SB_UNFROZEN; 1401 1227 smp_wmb(); 1402 - wake_up(&sb->s_wait_unfrozen); 1228 + wake_up(&sb->s_writers.wait_unfrozen); 1403 1229 deactivate_locked_super(sb); 1404 1230 return ret; 1405 1231 } 1406 1232 } 1233 + /* 1234 + * This is just for debugging purposes so that fs can warn if it 1235 + * sees write activity when frozen is set to SB_FREEZE_COMPLETE. 1236 + */ 1237 + sb->s_writers.frozen = SB_FREEZE_COMPLETE; 1407 1238 up_write(&sb->s_umount); 1408 1239 return 0; 1409 1240 } ··· 1440 1231 int error; 1441 1232 1442 1233 down_write(&sb->s_umount); 1443 - if (sb->s_frozen == SB_UNFROZEN) { 1234 + if (sb->s_writers.frozen == SB_UNFROZEN) { 1444 1235 up_write(&sb->s_umount); 1445 1236 return -EINVAL; 1446 1237 } ··· 1453 1244 if (error) { 1454 1245 printk(KERN_ERR 1455 1246 "VFS:Filesystem thaw failed\n"); 1456 - sb->s_frozen = SB_FREEZE_TRANS; 1457 1247 up_write(&sb->s_umount); 1458 1248 return error; 1459 1249 } 1460 1250 } 1461 1251 1462 1252 out: 1463 - sb->s_frozen = SB_UNFROZEN; 1253 + sb->s_writers.frozen = SB_UNFROZEN; 1464 1254 smp_wmb(); 1465 - wake_up(&sb->s_wait_unfrozen); 1255 + wake_up(&sb->s_writers.wait_unfrozen); 1466 1256 deactivate_locked_super(sb); 1467 1257 1468 1258 return 0;
+2
fs/sysfs/bin.c
··· 228 228 ret = 0; 229 229 if (bb->vm_ops->page_mkwrite) 230 230 ret = bb->vm_ops->page_mkwrite(vma, vmf); 231 + else 232 + file_update_time(file); 231 233 232 234 sysfs_put_active(attr_sd); 233 235 return ret;
+18
fs/xfs/xfs_aops.c
··· 124 124 ioend->io_append_trans = tp; 125 125 126 126 /* 127 + * We will pass freeze protection with a transaction. So tell lockdep 128 + * we released it. 129 + */ 130 + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 131 + 1, _THIS_IP_); 132 + /* 127 133 * We hand off the transaction to the completion thread now, so 128 134 * clear the flag here. 129 135 */ ··· 205 199 struct xfs_inode *ip = XFS_I(ioend->io_inode); 206 200 int error = 0; 207 201 202 + if (ioend->io_append_trans) { 203 + /* 204 + * We've got freeze protection passed with the transaction. 205 + * Tell lockdep about it. 206 + */ 207 + rwsem_acquire_read( 208 + &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 209 + 0, 1, _THIS_IP_); 210 + } 208 211 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 209 212 ioend->io_error = -EIO; 210 213 goto done; ··· 1440 1425 if (ioend->io_append_trans) { 1441 1426 current_set_flags_nested(&ioend->io_append_trans->t_pflags, 1442 1427 PF_FSTRANS); 1428 + rwsem_acquire_read( 1429 + &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 1430 + 0, 1, _THIS_IP_); 1443 1431 xfs_trans_cancel(ioend->io_append_trans, 0); 1444 1432 } 1445 1433 out_destroy_ioend:
+7 -3
fs/xfs/xfs_file.c
··· 770 770 if (ocount == 0) 771 771 return 0; 772 772 773 - xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); 773 + sb_start_write(inode->i_sb); 774 774 775 - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 776 - return -EIO; 775 + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 776 + ret = -EIO; 777 + goto out; 778 + } 777 779 778 780 if (unlikely(file->f_flags & O_DIRECT)) 779 781 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); ··· 794 792 ret = err; 795 793 } 796 794 795 + out: 796 + sb_end_write(inode->i_sb); 797 797 return ret; 798 798 } 799 799
+52 -3
fs/xfs/xfs_ioctl.c
··· 364 364 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) 365 365 return -XFS_ERROR(EFAULT); 366 366 367 + error = mnt_want_write_file(parfilp); 368 + if (error) 369 + return error; 370 + 367 371 dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); 368 - if (IS_ERR(dentry)) 372 + if (IS_ERR(dentry)) { 373 + mnt_drop_write_file(parfilp); 369 374 return PTR_ERR(dentry); 375 + } 370 376 371 377 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { 372 378 error = -XFS_ERROR(EPERM); ··· 388 382 fsd.fsd_dmstate); 389 383 390 384 out: 385 + mnt_drop_write_file(parfilp); 391 386 dput(dentry); 392 387 return error; 393 388 } ··· 641 634 if (ioflags & IO_INVIS) 642 635 attr_flags |= XFS_ATTR_DMI; 643 636 637 + error = mnt_want_write_file(filp); 638 + if (error) 639 + return error; 644 640 error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); 641 + mnt_drop_write_file(filp); 645 642 return -error; 646 643 } 647 644 ··· 1174 1163 { 1175 1164 struct fsxattr fa; 1176 1165 unsigned int mask; 1166 + int error; 1177 1167 1178 1168 if (copy_from_user(&fa, arg, sizeof(fa))) 1179 1169 return -EFAULT; ··· 1183 1171 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 1184 1172 mask |= FSX_NONBLOCK; 1185 1173 1186 - return -xfs_ioctl_setattr(ip, &fa, mask); 1174 + error = mnt_want_write_file(filp); 1175 + if (error) 1176 + return error; 1177 + error = xfs_ioctl_setattr(ip, &fa, mask); 1178 + mnt_drop_write_file(filp); 1179 + return -error; 1187 1180 } 1188 1181 1189 1182 STATIC int ··· 1213 1196 struct fsxattr fa; 1214 1197 unsigned int flags; 1215 1198 unsigned int mask; 1199 + int error; 1216 1200 1217 1201 if (copy_from_user(&flags, arg, sizeof(flags))) 1218 1202 return -EFAULT; ··· 1228 1210 mask |= FSX_NONBLOCK; 1229 1211 fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); 1230 1212 1231 - return -xfs_ioctl_setattr(ip, &fa, mask); 1213 + error = mnt_want_write_file(filp); 1214 + if (error) 1215 + return error; 1216 + error 
= xfs_ioctl_setattr(ip, &fa, mask); 1217 + mnt_drop_write_file(filp); 1218 + return -error; 1232 1219 } 1233 1220 1234 1221 STATIC int ··· 1408 1385 if (copy_from_user(&dmi, arg, sizeof(dmi))) 1409 1386 return -XFS_ERROR(EFAULT); 1410 1387 1388 + error = mnt_want_write_file(filp); 1389 + if (error) 1390 + return error; 1391 + 1411 1392 error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, 1412 1393 dmi.fsd_dmstate); 1394 + mnt_drop_write_file(filp); 1413 1395 return -error; 1414 1396 } 1415 1397 ··· 1462 1434 1463 1435 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) 1464 1436 return -XFS_ERROR(EFAULT); 1437 + error = mnt_want_write_file(filp); 1438 + if (error) 1439 + return error; 1465 1440 error = xfs_swapext(&sxp); 1441 + mnt_drop_write_file(filp); 1466 1442 return -error; 1467 1443 } 1468 1444 ··· 1495 1463 if (copy_from_user(&inout, arg, sizeof(inout))) 1496 1464 return -XFS_ERROR(EFAULT); 1497 1465 1466 + error = mnt_want_write_file(filp); 1467 + if (error) 1468 + return error; 1469 + 1498 1470 /* input parameter is passed in resblks field of structure */ 1499 1471 in = inout.resblks; 1500 1472 error = xfs_reserve_blocks(mp, &in, &inout); 1473 + mnt_drop_write_file(filp); 1501 1474 if (error) 1502 1475 return -error; 1503 1476 ··· 1533 1496 if (copy_from_user(&in, arg, sizeof(in))) 1534 1497 return -XFS_ERROR(EFAULT); 1535 1498 1499 + error = mnt_want_write_file(filp); 1500 + if (error) 1501 + return error; 1536 1502 error = xfs_growfs_data(mp, &in); 1503 + mnt_drop_write_file(filp); 1537 1504 return -error; 1538 1505 } 1539 1506 ··· 1547 1506 if (copy_from_user(&in, arg, sizeof(in))) 1548 1507 return -XFS_ERROR(EFAULT); 1549 1508 1509 + error = mnt_want_write_file(filp); 1510 + if (error) 1511 + return error; 1550 1512 error = xfs_growfs_log(mp, &in); 1513 + mnt_drop_write_file(filp); 1551 1514 return -error; 1552 1515 } 1553 1516 ··· 1561 1516 if (copy_from_user(&in, arg, sizeof(in))) 1562 1517 return -XFS_ERROR(EFAULT); 1563 1518 1519 + error = 
mnt_want_write_file(filp); 1520 + if (error) 1521 + return error; 1564 1522 error = xfs_growfs_rt(mp, &in); 1523 + mnt_drop_write_file(filp); 1565 1524 return -error; 1566 1525 } 1567 1526
+12
fs/xfs/xfs_ioctl32.c
··· 600 600 601 601 if (xfs_compat_growfs_data_copyin(&in, arg)) 602 602 return -XFS_ERROR(EFAULT); 603 + error = mnt_want_write_file(filp); 604 + if (error) 605 + return error; 603 606 error = xfs_growfs_data(mp, &in); 607 + mnt_drop_write_file(filp); 604 608 return -error; 605 609 } 606 610 case XFS_IOC_FSGROWFSRT_32: { ··· 612 608 613 609 if (xfs_compat_growfs_rt_copyin(&in, arg)) 614 610 return -XFS_ERROR(EFAULT); 611 + error = mnt_want_write_file(filp); 612 + if (error) 613 + return error; 615 614 error = xfs_growfs_rt(mp, &in); 615 + mnt_drop_write_file(filp); 616 616 return -error; 617 617 } 618 618 #endif ··· 635 627 offsetof(struct xfs_swapext, sx_stat)) || 636 628 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) 637 629 return -XFS_ERROR(EFAULT); 630 + error = mnt_want_write_file(filp); 631 + if (error) 632 + return error; 638 633 error = xfs_swapext(&sxp); 634 + mnt_drop_write_file(filp); 639 635 return -error; 640 636 } 641 637 case XFS_IOC_FSBULKSTAT_32:
+2 -2
fs/xfs/xfs_iomap.c
··· 680 680 * the same inode that we complete here and might deadlock 681 681 * on the iolock. 682 682 */ 683 - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); 683 + sb_start_intwrite(mp->m_super); 684 684 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); 685 - tp->t_flags |= XFS_TRANS_RESERVE; 685 + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; 686 686 error = xfs_trans_reserve(tp, resblks, 687 687 XFS_WRITE_LOG_RES(mp), 0, 688 688 XFS_TRANS_PERM_LOG_RES,
+1 -1
fs/xfs/xfs_mount.c
··· 1551 1551 int 1552 1552 xfs_fs_writable(xfs_mount_t *mp) 1553 1553 { 1554 - return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || 1554 + return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || 1555 1555 (mp->m_flags & XFS_MOUNT_RDONLY)); 1556 1556 } 1557 1557
-3
fs/xfs/xfs_mount.h
··· 311 311 #define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ 312 312 #define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ 313 313 314 - #define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) 315 - #define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) 316 - 317 314 /* 318 315 * Flags for xfs_mountfs 319 316 */
+1 -1
fs/xfs/xfs_sync.c
··· 403 403 if (!(mp->m_super->s_flags & MS_ACTIVE) && 404 404 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 405 405 /* dgc: errors ignored here */ 406 - if (mp->m_super->s_frozen == SB_UNFROZEN && 406 + if (mp->m_super->s_writers.frozen == SB_UNFROZEN && 407 407 xfs_log_need_covered(mp)) 408 408 error = xfs_fs_log_dummy(mp); 409 409 else
+14 -3
fs/xfs/xfs_trans.c
··· 576 576 xfs_mount_t *mp, 577 577 uint type) 578 578 { 579 - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); 580 - return _xfs_trans_alloc(mp, type, KM_SLEEP); 579 + xfs_trans_t *tp; 580 + 581 + sb_start_intwrite(mp->m_super); 582 + tp = _xfs_trans_alloc(mp, type, KM_SLEEP); 583 + tp->t_flags |= XFS_TRANS_FREEZE_PROT; 584 + return tp; 581 585 } 582 586 583 587 xfs_trans_t * ··· 592 588 { 593 589 xfs_trans_t *tp; 594 590 591 + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); 595 592 atomic_inc(&mp->m_active_trans); 596 593 597 594 tp = kmem_zone_zalloc(xfs_trans_zone, memflags); ··· 616 611 xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); 617 612 618 613 atomic_dec(&tp->t_mountp->m_active_trans); 614 + if (tp->t_flags & XFS_TRANS_FREEZE_PROT) 615 + sb_end_intwrite(tp->t_mountp->m_super); 619 616 xfs_trans_free_dqinfo(tp); 620 617 kmem_zone_free(xfs_trans_zone, tp); 621 618 } ··· 650 643 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 651 644 ASSERT(tp->t_ticket != NULL); 652 645 653 - ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); 646 + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | 647 + (tp->t_flags & XFS_TRANS_RESERVE) | 648 + (tp->t_flags & XFS_TRANS_FREEZE_PROT); 649 + /* We gave our writer reference to the new transaction */ 650 + tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; 654 651 ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); 655 652 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; 656 653 tp->t_blk_res = tp->t_blk_res_used;
+2
fs/xfs/xfs_trans.h
··· 179 179 #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ 180 180 #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ 181 181 #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ 182 + #define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer 183 + count in superblock */ 182 184 183 185 /* 184 186 * Values for call flags parameter.
+4
include/linux/audit.h
··· 130 130 #define AUDIT_LAST_KERN_ANOM_MSG 1799 131 131 #define AUDIT_ANOM_PROMISCUOUS 1700 /* Device changed promiscuous mode */ 132 132 #define AUDIT_ANOM_ABEND 1701 /* Process ended abnormally */ 133 + #define AUDIT_ANOM_LINK 1702 /* Suspicious use of file links */ 133 134 #define AUDIT_INTEGRITY_DATA 1800 /* Data integrity verification */ 134 135 #define AUDIT_INTEGRITY_METADATA 1801 /* Metadata integrity verification */ 135 136 #define AUDIT_INTEGRITY_STATUS 1802 /* Integrity enable status */ ··· 688 687 const struct path *path); 689 688 extern void audit_log_key(struct audit_buffer *ab, 690 689 char *key); 690 + extern void audit_log_link_denied(const char *operation, 691 + struct path *link); 691 692 extern void audit_log_lost(const char *message); 692 693 #ifdef CONFIG_SECURITY 693 694 extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); ··· 719 716 #define audit_log_untrustedstring(a,s) do { ; } while (0) 720 717 #define audit_log_d_path(b, p, d) do { ; } while (0) 721 718 #define audit_log_key(b, k) do { ; } while (0) 719 + #define audit_log_link_denied(o, l) do { ; } while (0) 722 720 #define audit_log_secctx(b,s) do { ; } while (0) 723 721 #define audit_enabled 0 724 722 #endif
+142 -12
include/linux/fs.h
··· 414 414 #include <linux/shrinker.h> 415 415 #include <linux/migrate_mode.h> 416 416 #include <linux/uidgid.h> 417 + #include <linux/lockdep.h> 417 418 418 419 #include <asm/byteorder.h> 419 420 ··· 441 440 extern int sysctl_nr_open; 442 441 extern struct inodes_stat_t inodes_stat; 443 442 extern int leases_enable, lease_break_time; 443 + extern int sysctl_protected_symlinks; 444 + extern int sysctl_protected_hardlinks; 444 445 445 446 struct buffer_head; 446 447 typedef int (get_block_t)(struct inode *inode, sector_t iblock, ··· 1448 1445 extern pid_t f_getown(struct file *filp); 1449 1446 extern int send_sigurg(struct fown_struct *fown); 1450 1447 1448 + struct mm_struct; 1449 + 1451 1450 /* 1452 1451 * Umount options 1453 1452 */ ··· 1462 1457 1463 1458 extern struct list_head super_blocks; 1464 1459 extern spinlock_t sb_lock; 1460 + 1461 + /* Possible states of 'frozen' field */ 1462 + enum { 1463 + SB_UNFROZEN = 0, /* FS is unfrozen */ 1464 + SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ 1465 + SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ 1466 + SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop 1467 + * internal threads if needed) */ 1468 + SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ 1469 + }; 1470 + 1471 + #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) 1472 + 1473 + struct sb_writers { 1474 + /* Counters for counting writers at each level */ 1475 + struct percpu_counter counter[SB_FREEZE_LEVELS]; 1476 + wait_queue_head_t wait; /* queue for waiting for 1477 + writers / faults to finish */ 1478 + int frozen; /* Is sb frozen? 
*/ 1479 + wait_queue_head_t wait_unfrozen; /* queue for waiting for 1480 + sb to be thawed */ 1481 + #ifdef CONFIG_DEBUG_LOCK_ALLOC 1482 + struct lockdep_map lock_map[SB_FREEZE_LEVELS]; 1483 + #endif 1484 + }; 1465 1485 1466 1486 struct super_block { 1467 1487 struct list_head s_list; /* Keep this first */ ··· 1535 1505 struct hlist_node s_instances; 1536 1506 struct quota_info s_dquot; /* Diskquota specific options */ 1537 1507 1538 - int s_frozen; 1539 - wait_queue_head_t s_wait_unfrozen; 1508 + struct sb_writers s_writers; 1540 1509 1541 1510 char s_id[32]; /* Informational name */ 1542 1511 u8 s_uuid[16]; /* UUID */ ··· 1590 1561 /* 1591 1562 * Snapshotting support. 1592 1563 */ 1593 - enum { 1594 - SB_UNFROZEN = 0, 1595 - SB_FREEZE_WRITE = 1, 1596 - SB_FREEZE_TRANS = 2, 1597 - }; 1598 1564 1599 - #define vfs_check_frozen(sb, level) \ 1600 - wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) 1565 + void __sb_end_write(struct super_block *sb, int level); 1566 + int __sb_start_write(struct super_block *sb, int level, bool wait); 1567 + 1568 + /** 1569 + * sb_end_write - drop write access to a superblock 1570 + * @sb: the super we wrote to 1571 + * 1572 + * Decrement number of writers to the filesystem. Wake up possible waiters 1573 + * wanting to freeze the filesystem. 1574 + */ 1575 + static inline void sb_end_write(struct super_block *sb) 1576 + { 1577 + __sb_end_write(sb, SB_FREEZE_WRITE); 1578 + } 1579 + 1580 + /** 1581 + * sb_end_pagefault - drop write access to a superblock from a page fault 1582 + * @sb: the super we wrote to 1583 + * 1584 + * Decrement number of processes handling write page fault to the filesystem. 1585 + * Wake up possible waiters wanting to freeze the filesystem. 
1586 + */ 1587 + static inline void sb_end_pagefault(struct super_block *sb) 1588 + { 1589 + __sb_end_write(sb, SB_FREEZE_PAGEFAULT); 1590 + } 1591 + 1592 + /** 1593 + * sb_end_intwrite - drop write access to a superblock for internal fs purposes 1594 + * @sb: the super we wrote to 1595 + * 1596 + * Decrement fs-internal number of writers to the filesystem. Wake up possible 1597 + * waiters wanting to freeze the filesystem. 1598 + */ 1599 + static inline void sb_end_intwrite(struct super_block *sb) 1600 + { 1601 + __sb_end_write(sb, SB_FREEZE_FS); 1602 + } 1603 + 1604 + /** 1605 + * sb_start_write - get write access to a superblock 1606 + * @sb: the super we write to 1607 + * 1608 + * When a process wants to write data or metadata to a file system (i.e. dirty 1609 + * a page or an inode), it should embed the operation in a sb_start_write() - 1610 + * sb_end_write() pair to get exclusion against file system freezing. This 1611 + * function increments number of writers preventing freezing. If the file 1612 + * system is already frozen, the function waits until the file system is 1613 + * thawed. 1614 + * 1615 + * Since freeze protection behaves as a lock, users have to preserve 1616 + * ordering of freeze protection and other filesystem locks. Generally, 1617 + * freeze protection should be the outermost lock. In particular, we have: 1618 + * 1619 + * sb_start_write 1620 + * -> i_mutex (write path, truncate, directory ops, ...) 
1621 + * -> s_umount (freeze_super, thaw_super) 1622 + */ 1623 + static inline void sb_start_write(struct super_block *sb) 1624 + { 1625 + __sb_start_write(sb, SB_FREEZE_WRITE, true); 1626 + } 1627 + 1628 + static inline int sb_start_write_trylock(struct super_block *sb) 1629 + { 1630 + return __sb_start_write(sb, SB_FREEZE_WRITE, false); 1631 + } 1632 + 1633 + /** 1634 + * sb_start_pagefault - get write access to a superblock from a page fault 1635 + * @sb: the super we write to 1636 + * 1637 + * When a process starts handling write page fault, it should embed the 1638 + * operation into sb_start_pagefault() - sb_end_pagefault() pair to get 1639 + * exclusion against file system freezing. This is needed since the page fault 1640 + * is going to dirty a page. This function increments number of running page 1641 + * faults preventing freezing. If the file system is already frozen, the 1642 + * function waits until the file system is thawed. 1643 + * 1644 + * Since page fault freeze protection behaves as a lock, users have to preserve 1645 + * ordering of freeze protection and other filesystem locks. It is advised to 1646 + * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault 1647 + * handling code implies lock dependency: 1648 + * 1649 + * mmap_sem 1650 + * -> sb_start_pagefault 1651 + */ 1652 + static inline void sb_start_pagefault(struct super_block *sb) 1653 + { 1654 + __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); 1655 + } 1656 + 1657 + /* 1658 + * sb_start_intwrite - get write access to a superblock for internal fs purposes 1659 + * @sb: the super we write to 1660 + * 1661 + * This is the third level of protection against filesystem freezing. It is 1662 + * free for use by a filesystem. The only requirement is that it must rank 1663 + * below sb_start_pagefault. 
1664 + * 1665 + * For example filesystem can call sb_start_intwrite() when starting a 1666 + * transaction which somewhat eases handling of freezing for internal sources 1667 + * of filesystem changes (internal fs threads, discarding preallocation on file 1668 + * close, etc.). 1669 + */ 1670 + static inline void sb_start_intwrite(struct super_block *sb) 1671 + { 1672 + __sb_start_write(sb, SB_FREEZE_FS, true); 1673 + } 1674 + 1601 1675 1602 1676 extern bool inode_owner_or_capable(const struct inode *inode); 1603 1677 ··· 2024 1892 struct lock_class_key s_lock_key; 2025 1893 struct lock_class_key s_umount_key; 2026 1894 struct lock_class_key s_vfs_rename_key; 1895 + struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; 2027 1896 2028 1897 struct lock_class_key i_lock_key; 2029 1898 struct lock_class_key i_mutex_key; ··· 2467 2334 } 2468 2335 #endif 2469 2336 extern int do_pipe_flags(int *, int); 2470 - extern struct file *create_read_pipe(struct file *f, int flags); 2471 - extern struct file *create_write_pipe(int flags); 2472 - extern void free_write_pipe(struct file *); 2473 2337 2474 2338 extern int kernel_read(struct file *, loff_t, char *, unsigned long); 2475 2339 extern struct file * open_exec(const char *);
+1
include/linux/mm.h
··· 1441 1441 1442 1442 /* generic vm_area_ops exported for stackable file systems */ 1443 1443 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); 1444 + extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1444 1445 1445 1446 /* mm/page-writeback.c */ 1446 1447 int write_one_page(struct page *page, int wait);
+1
include/linux/namei.h
··· 67 67 68 68 extern struct dentry *kern_path_create(int, const char *, struct path *, int); 69 69 extern struct dentry *user_path_create(int, const char __user *, struct path *, int); 70 + extern void done_path_create(struct path *, struct dentry *); 70 71 extern struct dentry *kern_path_locked(const char *, struct path *); 71 72 extern int vfs_path_lookup(struct dentry *, struct vfsmount *, 72 73 const char *, unsigned int, struct path *);
+1
include/linux/nfsd/nfsfh.h
··· 143 143 int fh_maxsize; /* max size for fh_handle */ 144 144 145 145 unsigned char fh_locked; /* inode locked by us */ 146 + unsigned char fh_want_write; /* remount protection taken */ 146 147 147 148 #ifdef CONFIG_NFSD_V3 148 149 unsigned char fh_post_saved; /* post-op attrs saved */
+2
include/linux/pipe_fs_i.h
··· 160 160 long pipe_fcntl(struct file *, unsigned int, unsigned long arg); 161 161 struct pipe_inode_info *get_pipe_info(struct file *file); 162 162 163 + int create_pipe_files(struct file **, int); 164 + 163 165 #endif
+21
kernel/audit.c
··· 1456 1456 } 1457 1457 1458 1458 /** 1459 + * audit_log_link_denied - report a link restriction denial 1460 + * @operation: specific link operation 1461 + * @link: the path that triggered the restriction 1462 + */ 1463 + void audit_log_link_denied(const char *operation, struct path *link) 1464 + { 1465 + struct audit_buffer *ab; 1466 + 1467 + ab = audit_log_start(current->audit_context, GFP_KERNEL, 1468 + AUDIT_ANOM_LINK); 1469 + audit_log_format(ab, "op=%s action=denied", operation); 1470 + audit_log_format(ab, " pid=%d comm=", current->pid); 1471 + audit_log_untrustedstring(ab, current->comm); 1472 + audit_log_d_path(ab, " path=", link); 1473 + audit_log_format(ab, " dev="); 1474 + audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); 1475 + audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); 1476 + audit_log_end(ab); 1477 + } 1478 + 1479 + /** 1480 + * audit_log_end - end one audit record 1481 + * @ab: the audit_buffer 1482 + *
+18
kernel/sysctl.c
··· 1498 1498 #endif 1499 1499 #endif 1500 1500 { 1501 + .procname = "protected_symlinks", 1502 + .data = &sysctl_protected_symlinks, 1503 + .maxlen = sizeof(int), 1504 + .mode = 0600, 1505 + .proc_handler = proc_dointvec_minmax, 1506 + .extra1 = &zero, 1507 + .extra2 = &one, 1508 + }, 1509 + { 1510 + .procname = "protected_hardlinks", 1511 + .data = &sysctl_protected_hardlinks, 1512 + .maxlen = sizeof(int), 1513 + .mode = 0600, 1514 + .proc_handler = proc_dointvec_minmax, 1515 + .extra1 = &zero, 1516 + .extra2 = &one, 1517 + }, 1518 + { 1501 1519 .procname = "suid_dumpable", 1502 1520 .data = &suid_dumpable, 1503 1521 .maxlen = sizeof(int),
+7 -7
lib/percpu_counter.c
··· 12 12 13 13 #ifdef CONFIG_HOTPLUG_CPU 14 14 static LIST_HEAD(percpu_counters); 15 - static DEFINE_MUTEX(percpu_counters_lock); 15 + static DEFINE_SPINLOCK(percpu_counters_lock); 16 16 #endif 17 17 18 18 #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER ··· 123 123 124 124 #ifdef CONFIG_HOTPLUG_CPU 125 125 INIT_LIST_HEAD(&fbc->list); 126 - mutex_lock(&percpu_counters_lock); 126 + spin_lock(&percpu_counters_lock); 127 127 list_add(&fbc->list, &percpu_counters); 128 - mutex_unlock(&percpu_counters_lock); 128 + spin_unlock(&percpu_counters_lock); 129 129 #endif 130 130 return 0; 131 131 } ··· 139 139 debug_percpu_counter_deactivate(fbc); 140 140 141 141 #ifdef CONFIG_HOTPLUG_CPU 142 - mutex_lock(&percpu_counters_lock); 142 + spin_lock(&percpu_counters_lock); 143 143 list_del(&fbc->list); 144 - mutex_unlock(&percpu_counters_lock); 144 + spin_unlock(&percpu_counters_lock); 145 145 #endif 146 146 free_percpu(fbc->counters); 147 147 fbc->counters = NULL; ··· 170 170 return NOTIFY_OK; 171 171 172 172 cpu = (unsigned long)hcpu; 173 - mutex_lock(&percpu_counters_lock); 173 + spin_lock(&percpu_counters_lock); 174 174 list_for_each_entry(fbc, &percpu_counters, list) { 175 175 s32 *pcount; 176 176 unsigned long flags; ··· 181 181 *pcount = 0; 182 182 raw_spin_unlock_irqrestore(&fbc->lock, flags); 183 183 } 184 - mutex_unlock(&percpu_counters_lock); 184 + spin_unlock(&percpu_counters_lock); 185 185 #endif 186 186 return NOTIFY_OK; 187 187 }
+29 -2
mm/filemap.c
··· 1712 1712 } 1713 1713 EXPORT_SYMBOL(filemap_fault); 1714 1714 1715 + int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1716 + { 1717 + struct page *page = vmf->page; 1718 + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1719 + int ret = VM_FAULT_LOCKED; 1720 + 1721 + sb_start_pagefault(inode->i_sb); 1722 + file_update_time(vma->vm_file); 1723 + lock_page(page); 1724 + if (page->mapping != inode->i_mapping) { 1725 + unlock_page(page); 1726 + ret = VM_FAULT_NOPAGE; 1727 + goto out; 1728 + } 1729 + /* 1730 + * We mark the page dirty already here so that when freeze is in 1731 + * progress, we are guaranteed that writeback during freezing will 1732 + * see the dirty page and writeprotect it again. 1733 + */ 1734 + set_page_dirty(page); 1735 + out: 1736 + sb_end_pagefault(inode->i_sb); 1737 + return ret; 1738 + } 1739 + EXPORT_SYMBOL(filemap_page_mkwrite); 1740 + 1715 1741 const struct vm_operations_struct generic_file_vm_ops = { 1716 1742 .fault = filemap_fault, 1743 + .page_mkwrite = filemap_page_mkwrite, 1717 1744 }; 1718 1745 1719 1746 /* This is used for a general mmap of a disk file */ ··· 2434 2407 count = ocount; 2435 2408 pos = *ppos; 2436 2409 2437 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2438 - 2439 2410 /* We can write back this queue in page reclaim */ 2440 2411 current->backing_dev_info = mapping->backing_dev_info; 2441 2412 written = 0; ··· 2532 2507 2533 2508 BUG_ON(iocb->ki_pos != pos); 2534 2509 2510 + sb_start_write(inode->i_sb); 2535 2511 mutex_lock(&inode->i_mutex); 2536 2512 blk_start_plug(&plug); 2537 2513 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); ··· 2546 2520 ret = err; 2547 2521 } 2548 2522 blk_finish_plug(&plug); 2523 + sb_end_write(inode->i_sb); 2549 2524 return ret; 2550 2525 } 2551 2526 EXPORT_SYMBOL(generic_file_aio_write);
+4 -2
mm/filemap_xip.c
··· 304 304 305 305 static const struct vm_operations_struct xip_file_vm_ops = { 306 306 .fault = xip_file_fault, 307 + .page_mkwrite = filemap_page_mkwrite, 307 308 }; 308 309 309 310 int xip_file_mmap(struct file * file, struct vm_area_struct * vma) ··· 402 401 loff_t pos; 403 402 ssize_t ret; 404 403 404 + sb_start_write(inode->i_sb); 405 + 405 406 mutex_lock(&inode->i_mutex); 406 407 407 408 if (!access_ok(VERIFY_READ, buf, len)) { ··· 413 410 414 411 pos = *ppos; 415 412 count = len; 416 - 417 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 418 413 419 414 /* We can write back this queue in page reclaim */ 420 415 current->backing_dev_info = mapping->backing_dev_info; ··· 437 436 current->backing_dev_info = NULL; 438 437 out_up: 439 438 mutex_unlock(&inode->i_mutex); 439 + sb_end_write(inode->i_sb); 440 440 return ret; 441 441 } 442 442 EXPORT_SYMBOL_GPL(xip_file_write);
+7 -7
mm/memory.c
··· 2650 2650 if (!page_mkwrite) { 2651 2651 wait_on_page_locked(dirty_page); 2652 2652 set_page_dirty_balance(dirty_page, page_mkwrite); 2653 + /* file_update_time outside page_lock */ 2654 + if (vma->vm_file) 2655 + file_update_time(vma->vm_file); 2653 2656 } 2654 2657 put_page(dirty_page); 2655 2658 if (page_mkwrite) { ··· 2669 2666 balance_dirty_pages_ratelimited(mapping); 2670 2667 } 2671 2668 } 2672 - 2673 - /* file_update_time outside page_lock */ 2674 - if (vma->vm_file) 2675 - file_update_time(vma->vm_file); 2676 2669 2677 2670 return ret; 2678 2671 } ··· 3338 3339 3339 3340 if (dirty_page) { 3340 3341 struct address_space *mapping = page->mapping; 3342 + int dirtied = 0; 3341 3343 3342 3344 if (set_page_dirty(dirty_page)) 3343 - page_mkwrite = 1; 3345 + dirtied = 1; 3344 3346 unlock_page(dirty_page); 3345 3347 put_page(dirty_page); 3346 - if (page_mkwrite && mapping) { 3348 + if ((dirtied || page_mkwrite) && mapping) { 3347 3349 /* 3348 3350 * Some device drivers do not set page.mapping but still 3349 3351 * dirty their pages ··· 3353 3353 } 3354 3354 3355 3355 /* file_update_time outside page_lock */ 3356 - if (vma->vm_file) 3356 + if (vma->vm_file && !page_mkwrite) 3357 3357 file_update_time(vma->vm_file); 3358 3358 } else { 3359 3359 unlock_page(vmf.page);
+43 -50
net/unix/af_unix.c
··· 823 823 return NULL; 824 824 } 825 825 826 + static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) 827 + { 828 + struct dentry *dentry; 829 + struct path path; 830 + int err = 0; 831 + /* 832 + * Get the parent directory, calculate the hash for last 833 + * component. 834 + */ 835 + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); 836 + err = PTR_ERR(dentry); 837 + if (IS_ERR(dentry)) 838 + return err; 839 + 840 + /* 841 + * All right, let's create it. 842 + */ 843 + err = security_path_mknod(&path, dentry, mode, 0); 844 + if (!err) { 845 + err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); 846 + if (!err) { 847 + res->mnt = mntget(path.mnt); 848 + res->dentry = dget(dentry); 849 + } 850 + } 851 + done_path_create(&path, dentry); 852 + return err; 853 + } 826 854 827 855 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 828 856 { ··· 859 831 struct unix_sock *u = unix_sk(sk); 860 832 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 861 833 char *sun_path = sunaddr->sun_path; 862 - struct dentry *dentry = NULL; 863 - struct path path; 864 834 int err; 865 835 unsigned int hash; 866 836 struct unix_address *addr; ··· 895 869 atomic_set(&addr->refcnt, 1); 896 870 897 871 if (sun_path[0]) { 898 - umode_t mode; 899 - err = 0; 900 - /* 901 - * Get the parent directory, calculate the hash for last 902 - * component. 903 - */ 904 - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); 905 - err = PTR_ERR(dentry); 906 - if (IS_ERR(dentry)) 907 - goto out_mknod_parent; 908 - 909 - /* 910 - * All right, let's create it. 
911 - */ 912 - mode = S_IFSOCK | 872 + struct path path; 873 + umode_t mode = S_IFSOCK | 913 874 (SOCK_INODE(sock)->i_mode & ~current_umask()); 914 - err = mnt_want_write(path.mnt); 915 - if (err) 916 - goto out_mknod_dput; 917 - err = security_path_mknod(&path, dentry, mode, 0); 918 - if (err) 919 - goto out_mknod_drop_write; 920 - err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); 921 - out_mknod_drop_write: 922 - mnt_drop_write(path.mnt); 923 - if (err) 924 - goto out_mknod_dput; 925 - mutex_unlock(&path.dentry->d_inode->i_mutex); 926 - dput(path.dentry); 927 - path.dentry = dentry; 928 - 875 + err = unix_mknod(sun_path, mode, &path); 876 + if (err) { 877 + if (err == -EEXIST) 878 + err = -EADDRINUSE; 879 + unix_release_addr(addr); 880 + goto out_up; 881 + } 929 882 addr->hash = UNIX_HASH_SIZE; 930 - } 931 - 932 - spin_lock(&unix_table_lock); 933 - 934 - if (!sun_path[0]) { 883 + hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); 884 + spin_lock(&unix_table_lock); 885 + u->path = path; 886 + list = &unix_socket_table[hash]; 887 + } else { 888 + spin_lock(&unix_table_lock); 935 889 err = -EADDRINUSE; 936 890 if (__unix_find_socket_byname(net, sunaddr, addr_len, 937 891 sk->sk_type, hash)) { ··· 920 914 } 921 915 922 916 list = &unix_socket_table[addr->hash]; 923 - } else { 924 - list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; 925 - u->path = path; 926 917 } 927 918 928 919 err = 0; ··· 933 930 mutex_unlock(&u->readlock); 934 931 out: 935 932 return err; 936 - 937 - out_mknod_dput: 938 - dput(dentry); 939 - mutex_unlock(&path.dentry->d_inode->i_mutex); 940 - path_put(&path); 941 - out_mknod_parent: 942 - if (err == -EEXIST) 943 - err = -EADDRINUSE; 944 - unix_release_addr(addr); 945 - goto out_up; 946 933 } 947 934 948 935 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
+4 -4
sound/sound_firmware.c
··· 23 23 if (l <= 0 || l > 131072) 24 24 { 25 25 printk(KERN_INFO "Invalid firmware '%s'\n", fn); 26 - filp_close(filp, current->files); 26 + filp_close(filp, NULL); 27 27 return 0; 28 28 } 29 29 dp = vmalloc(l); 30 30 if (dp == NULL) 31 31 { 32 32 printk(KERN_INFO "Out of memory loading '%s'.\n", fn); 33 - filp_close(filp, current->files); 33 + filp_close(filp, NULL); 34 34 return 0; 35 35 } 36 36 pos = 0; ··· 38 38 { 39 39 printk(KERN_INFO "Failed to read '%s'.\n", fn); 40 40 vfree(dp); 41 - filp_close(filp, current->files); 41 + filp_close(filp, NULL); 42 42 return 0; 43 43 } 44 - filp_close(filp, current->files); 44 + filp_close(filp, NULL); 45 45 *fp = dp; 46 46 return (int) l; 47 47 }