Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

+2 -2

Documentation/filesystems/Locking

··· 138 138 put_super: write 139 139 write_super: read 140 140 sync_fs: read 141 - freeze_fs: read 142 - unfreeze_fs: read 141 + freeze_fs: write 142 + unfreeze_fs: write 143 143 statfs: maybe(read) (see below) 144 144 remount_fs: write 145 145 umount_begin: no

+42

Documentation/sysctl/fs.txt

··· 32 32 - nr_open 33 33 - overflowuid 34 34 - overflowgid 35 + - protected_hardlinks 36 + - protected_symlinks 35 37 - suid_dumpable 36 38 - super-max 37 39 - super-nr ··· 156 154 157 155 These sysctls allow you to change the value of the fixed UID and GID. 158 156 The default is 65534. 157 + 158 + ============================================================== 159 + 160 + protected_hardlinks: 161 + 162 + A long-standing class of security issues is the hardlink-based 163 + time-of-check-time-of-use race, most commonly seen in world-writable 164 + directories like /tmp. The common method of exploitation of this flaw 165 + is to cross privilege boundaries when following a given hardlink (i.e. a 166 + root process follows a hardlink created by another user). Additionally, 167 + on systems without separated partitions, this stops unauthorized users 168 + from "pinning" vulnerable setuid/setgid files against being upgraded by 169 + the administrator, or linking to special files. 170 + 171 + When set to "0", hardlink creation behavior is unrestricted. 172 + 173 + When set to "1" hardlinks cannot be created by users unless they 174 + already own the source file or have read/write access to it. 175 + 176 + This protection is based on the restrictions in Openwall and grsecurity. 177 + 178 + ============================================================== 179 + 180 + protected_symlinks: 181 + 182 + A long-standing class of security issues is the symlink-based 183 + time-of-check-time-of-use race, most commonly seen in world-writable 184 + directories like /tmp. The common method of exploitation of this flaw 185 + is to cross privilege boundaries when following a given symlink (i.e. a 186 + root process follows a symlink belonging to another user). For a likely 187 + incomplete list of hundreds of examples across the years, please see: 188 + http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp 189 + 190 + When set to "0", symlink following behavior is unrestricted. 
191 + 192 + When set to "1" symlinks are permitted to be followed only when outside 193 + a sticky world-writable directory, or when the uid of the symlink and 194 + follower match, or when the directory owner matches the symlink's owner. 195 + 196 + This protection is based on the restrictions in Openwall and grsecurity. 159 197 160 198 ============================================================== 161 199

+25 -52

arch/powerpc/platforms/cell/spufs/inode.c

··· 186 186 static int spufs_rmdir(struct inode *parent, struct dentry *dir) 187 187 { 188 188 /* remove all entries */ 189 + int res; 189 190 spufs_prune_dir(dir); 190 191 d_drop(dir); 191 - 192 - return simple_rmdir(parent, dir); 192 + res = simple_rmdir(parent, dir); 193 + /* We have to give up the mm_struct */ 194 + spu_forget(SPUFS_I(dir->d_inode)->i_ctx); 195 + return res; 193 196 } 194 197 195 198 static int spufs_fill_dir(struct dentry *dir, ··· 247 244 ret = spufs_rmdir(parent, dir); 248 245 mutex_unlock(&parent->i_mutex); 249 246 WARN_ON(ret); 250 - 251 - /* We have to give up the mm_struct */ 252 - spu_forget(ctx); 253 247 254 248 return dcache_dir_close(inode, file); 255 249 } ··· 450 450 struct spu_context *neighbor; 451 451 struct path path = {.mnt = mnt, .dentry = dentry}; 452 452 453 - ret = -EPERM; 454 453 if ((flags & SPU_CREATE_NOSCHED) && 455 454 !capable(CAP_SYS_NICE)) 456 - goto out_unlock; 455 + return -EPERM; 457 456 458 - ret = -EINVAL; 459 457 if ((flags & (SPU_CREATE_NOSCHED | SPU_CREATE_ISOLATE)) 460 458 == SPU_CREATE_ISOLATE) 461 - goto out_unlock; 459 + return -EINVAL; 462 460 463 - ret = -ENODEV; 464 461 if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader) 465 - goto out_unlock; 462 + return -ENODEV; 466 463 467 464 gang = NULL; 468 465 neighbor = NULL; 469 466 affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU); 470 467 if (affinity) { 471 468 gang = SPUFS_I(inode)->i_gang; 472 - ret = -EINVAL; 473 469 if (!gang) 474 - goto out_unlock; 470 + return -EINVAL; 475 471 mutex_lock(&gang->aff_mutex); 476 472 neighbor = spufs_assert_affinity(flags, gang, aff_filp); 477 473 if (IS_ERR(neighbor)) { ··· 488 492 } 489 493 490 494 ret = spufs_context_open(&path); 491 - if (ret < 0) { 495 + if (ret < 0) 492 496 WARN_ON(spufs_rmdir(inode, dentry)); 493 - if (affinity) 494 - mutex_unlock(&gang->aff_mutex); 495 - mutex_unlock(&inode->i_mutex); 496 - spu_forget(SPUFS_I(dentry->d_inode)->i_ctx); 497 - goto out; 498 - } 499 497 500 
498 out_aff_unlock: 501 499 if (affinity) 502 500 mutex_unlock(&gang->aff_mutex); 503 - out_unlock: 504 - mutex_unlock(&inode->i_mutex); 505 - out: 506 - dput(dentry); 507 501 return ret; 508 502 } 509 503 ··· 566 580 int ret; 567 581 568 582 ret = spufs_mkgang(inode, dentry, mode & S_IRWXUGO); 569 - if (ret) 570 - goto out; 571 - 572 - ret = spufs_gang_open(&path); 573 - if (ret < 0) { 574 - int err = simple_rmdir(inode, dentry); 575 - WARN_ON(err); 583 + if (!ret) { 584 + ret = spufs_gang_open(&path); 585 + if (ret < 0) { 586 + int err = simple_rmdir(inode, dentry); 587 + WARN_ON(err); 588 + } 576 589 } 577 - 578 - out: 579 - mutex_unlock(&inode->i_mutex); 580 - dput(dentry); 581 590 return ret; 582 591 } 583 592 ··· 582 601 long spufs_create(struct path *path, struct dentry *dentry, 583 602 unsigned int flags, umode_t mode, struct file *filp) 584 603 { 604 + struct inode *dir = path->dentry->d_inode; 585 605 int ret; 586 606 587 - ret = -EINVAL; 588 607 /* check if we are on spufs */ 589 608 if (path->dentry->d_sb->s_type != &spufs_type) 590 - goto out; 609 + return -EINVAL; 591 610 592 611 /* don't accept undefined flags */ 593 612 if (flags & (~SPU_CREATE_FLAG_ALL)) 594 - goto out; 613 + return -EINVAL; 595 614 596 615 /* only threads can be underneath a gang */ 597 - if (path->dentry != path->dentry->d_sb->s_root) { 598 - if ((flags & SPU_CREATE_GANG) || 599 - !SPUFS_I(path->dentry->d_inode)->i_gang) 600 - goto out; 601 - } 616 + if (path->dentry != path->dentry->d_sb->s_root) 617 + if ((flags & SPU_CREATE_GANG) || !SPUFS_I(dir)->i_gang) 618 + return -EINVAL; 602 619 603 620 mode &= ~current_umask(); 604 621 605 622 if (flags & SPU_CREATE_GANG) 606 - ret = spufs_create_gang(path->dentry->d_inode, 607 - dentry, path->mnt, mode); 623 + ret = spufs_create_gang(dir, dentry, path->mnt, mode); 608 624 else 609 - ret = spufs_create_context(path->dentry->d_inode, 610 - dentry, path->mnt, flags, mode, 625 + ret = spufs_create_context(dir, dentry, path->mnt, flags, 
mode, 611 626 filp); 612 627 if (ret >= 0) 613 - fsnotify_mkdir(path->dentry->d_inode, dentry); 614 - return ret; 628 + fsnotify_mkdir(dir, dentry); 615 629 616 - out: 617 - mutex_unlock(&path->dentry->d_inode->i_mutex); 618 - dput(dentry); 619 630 return ret; 620 631 } 621 632

+1 -1

arch/powerpc/platforms/cell/spufs/syscalls.c

··· 70 70 ret = PTR_ERR(dentry); 71 71 if (!IS_ERR(dentry)) { 72 72 ret = spufs_create(&path, dentry, flags, mode, neighbor); 73 - path_put(&path); 73 + done_path_create(&path, dentry); 74 74 } 75 75 76 76 return ret;

+2 -7

drivers/base/devtmpfs.c

··· 156 156 if (!err) 157 157 /* mark as kernel-created inode */ 158 158 dentry->d_inode->i_private = &thread; 159 - dput(dentry); 160 - mutex_unlock(&path.dentry->d_inode->i_mutex); 161 - path_put(&path); 159 + done_path_create(&path, dentry); 162 160 return err; 163 161 } 164 162 ··· 216 218 /* mark as kernel-created inode */ 217 219 dentry->d_inode->i_private = &thread; 218 220 } 219 - dput(dentry); 220 - 221 - mutex_unlock(&path.dentry->d_inode->i_mutex); 222 - path_put(&path); 221 + done_path_create(&path, dentry); 223 222 return err; 224 223 } 225 224

+1 -1

drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c

··· 1188 1188 kfree(buf); 1189 1189 /* close file before return */ 1190 1190 if (fp) 1191 - filp_close(fp, current->files); 1191 + filp_close(fp, NULL); 1192 1192 /* restore previous address limit */ 1193 1193 set_fs(old_fs); 1194 1194

+5 -26

drivers/staging/bcm/Misc.c

··· 157 157 158 158 static struct file *open_firmware_file(struct bcm_mini_adapter *Adapter, const char *path) 159 159 { 160 - struct file *flp = NULL; 161 - mm_segment_t oldfs; 162 - oldfs = get_fs(); 163 - set_fs(get_ds()); 164 - flp = filp_open(path, O_RDONLY, S_IRWXU); 165 - set_fs(oldfs); 160 + struct file *flp = filp_open(path, O_RDONLY, S_IRWXU); 166 161 if (IS_ERR(flp)) { 167 162 pr_err(DRV_NAME "Unable To Open File %s, err %ld", path, PTR_ERR(flp)); 168 163 flp = NULL; ··· 178 183 { 179 184 int errorno = 0; 180 185 struct file *flp = NULL; 181 - mm_segment_t oldfs; 182 186 struct timeval tv = {0}; 183 187 184 188 flp = open_firmware_file(Adapter, path); 185 189 if (!flp) { 186 - errorno = -ENOENT; 187 190 BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Unable to Open %s\n", path); 188 - goto exit_download; 191 + return -ENOENT; 189 192 } 190 193 BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Opened file is = %s and length =0x%lx to be downloaded at =0x%x", path, (unsigned long)flp->f_dentry->d_inode->i_size, loc); 191 194 do_gettimeofday(&tv); ··· 194 201 errorno = -EIO; 195 202 goto exit_download; 196 203 } 197 - oldfs = get_fs(); 198 - set_fs(get_ds()); 199 204 vfs_llseek(flp, 0, 0); 200 - set_fs(oldfs); 201 205 if (Adapter->bcm_file_readback_from_chip(Adapter->pvInterfaceAdapter, flp, loc)) { 202 206 BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Failed to read back firmware!"); 203 207 errorno = -EIO; ··· 202 212 } 203 213 204 214 exit_download: 205 - oldfs = get_fs(); 206 - set_fs(get_ds()); 207 - if (flp && !(IS_ERR(flp))) 208 - filp_close(flp, current->files); 209 - set_fs(oldfs); 210 - 215 + filp_close(flp, NULL); 211 216 return errorno; 212 217 } 213 218 ··· 1041 1056 static int bcm_parse_target_params(struct bcm_mini_adapter *Adapter) 1042 1057 { 1043 1058 struct file *flp = NULL; 1044 - mm_segment_t oldfs = {0}; 1045 1059 char *buff; 1046 1060 int len = 0; 1047 - loff_t pos = 0; 1048 
1061 1049 1062 buff = kmalloc(BUFFER_1K, GFP_KERNEL); 1050 1063 if (!buff) ··· 1062 1079 Adapter->pstargetparams = NULL; 1063 1080 return -ENOENT; 1064 1081 } 1065 - oldfs = get_fs(); 1066 - set_fs(get_ds()); 1067 - len = vfs_read(flp, (void __user __force *)buff, BUFFER_1K, &pos); 1068 - set_fs(oldfs); 1082 + len = kernel_read(flp, 0, buff, BUFFER_1K); 1083 + filp_close(flp, NULL); 1069 1084 1070 1085 if (len != sizeof(STARGETPARAMS)) { 1071 1086 BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Mismatch in Target Param Structure!\n"); 1072 1087 kfree(buff); 1073 1088 kfree(Adapter->pstargetparams); 1074 1089 Adapter->pstargetparams = NULL; 1075 - filp_close(flp, current->files); 1076 1090 return -ENOENT; 1077 1091 } 1078 - filp_close(flp, current->files); 1079 1092 1080 1093 /* Check for autolink in config params */ 1081 1094 /*

+3 -4

drivers/staging/gdm72xx/sdio_boot.c

··· 66 66 return -ENOENT; 67 67 } 68 68 69 - if (filp->f_dentry) 70 - inode = filp->f_dentry->d_inode; 71 - if (!inode || !S_ISREG(inode->i_mode)) { 69 + inode = filp->f_dentry->d_inode; 70 + if (!S_ISREG(inode->i_mode)) { 72 71 printk(KERN_ERR "Invalid file type: %s\n", img_name); 73 72 ret = -EINVAL; 74 73 goto out; ··· 122 123 pno++; 123 124 } 124 125 out: 125 - filp_close(filp, current->files); 126 + filp_close(filp, NULL); 126 127 return ret; 127 128 } 128 129

+9 -13

drivers/staging/gdm72xx/usb_boot.c

··· 173 173 filp = filp_open(img_name, O_RDONLY | O_LARGEFILE, 0); 174 174 if (IS_ERR(filp)) { 175 175 printk(KERN_ERR "Can't find %s.\n", img_name); 176 - set_fs(fs); 177 176 ret = PTR_ERR(filp); 178 177 goto restore_fs; 179 178 } 180 179 181 - if (filp->f_dentry) 182 - inode = filp->f_dentry->d_inode; 183 - if (!inode || !S_ISREG(inode->i_mode)) { 180 + inode = filp->f_dentry->d_inode; 181 + if (!S_ISREG(inode->i_mode)) { 184 182 printk(KERN_ERR "Invalid file type: %s\n", img_name); 185 183 ret = -EINVAL; 186 184 goto out; ··· 260 262 ret = -EINVAL; 261 263 } 262 264 out: 263 - filp_close(filp, current->files); 265 + filp_close(filp, NULL); 264 266 265 267 restore_fs: 266 268 set_fs(fs); ··· 320 322 goto restore_fs; 321 323 } 322 324 323 - if (filp->f_dentry) { 324 - inode = filp->f_dentry->d_inode; 325 - if (!inode || !S_ISREG(inode->i_mode)) { 326 - printk(KERN_ERR "Invalid file type: %s\n", path); 327 - ret = -EINVAL; 328 - goto out; 329 - } 325 + inode = filp->f_dentry->d_inode; 326 + if (!S_ISREG(inode->i_mode)) { 327 + printk(KERN_ERR "Invalid file type: %s\n", path); 328 + ret = -EINVAL; 329 + goto out; 330 330 } 331 331 332 332 buf = kmalloc(DOWNLOAD_CHUCK + pad_size, GFP_KERNEL); ··· 360 364 goto out; 361 365 362 366 out: 363 - filp_close(filp, current->files); 367 + filp_close(filp, NULL); 364 368 365 369 restore_fs: 366 370 set_fs(fs);

+5 -27

drivers/target/target_core_file.c

··· 109 109 struct se_subsystem_dev *se_dev, 110 110 void *p) 111 111 { 112 - char *dev_p = NULL; 113 112 struct se_device *dev; 114 113 struct se_dev_limits dev_limits; 115 114 struct queue_limits *limits; 116 115 struct fd_dev *fd_dev = p; 117 116 struct fd_host *fd_host = hba->hba_ptr; 118 - mm_segment_t old_fs; 119 117 struct file *file; 120 118 struct inode *inode = NULL; 121 119 int dev_flags = 0, flags, ret = -EINVAL; 122 120 123 121 memset(&dev_limits, 0, sizeof(struct se_dev_limits)); 124 122 125 - old_fs = get_fs(); 126 - set_fs(get_ds()); 127 - dev_p = getname(fd_dev->fd_dev_name); 128 - set_fs(old_fs); 129 - 130 - if (IS_ERR(dev_p)) { 131 - pr_err("getname(%s) failed: %lu\n", 132 - fd_dev->fd_dev_name, IS_ERR(dev_p)); 133 - ret = PTR_ERR(dev_p); 134 - goto fail; 135 - } 136 123 /* 137 124 * Use O_DSYNC by default instead of O_SYNC to forgo syncing 138 125 * of pure timestamp updates. 139 126 */ 140 127 flags = O_RDWR | O_CREAT | O_LARGEFILE | O_DSYNC; 141 128 142 - file = filp_open(dev_p, flags, 0600); 129 + file = filp_open(fd_dev->fd_dev_name, flags, 0600); 143 130 if (IS_ERR(file)) { 144 - pr_err("filp_open(%s) failed\n", dev_p); 131 + pr_err("filp_open(%s) failed\n", fd_dev->fd_dev_name); 145 132 ret = PTR_ERR(file); 146 - goto fail; 147 - } 148 - if (!file || !file->f_dentry) { 149 - pr_err("filp_open(%s) failed\n", dev_p); 150 133 goto fail; 151 134 } 152 135 fd_dev->fd_file = file; ··· 195 212 " %llu total bytes\n", fd_host->fd_host_id, fd_dev->fd_dev_id, 196 213 fd_dev->fd_dev_name, fd_dev->fd_dev_size); 197 214 198 - putname(dev_p); 199 215 return dev; 200 216 fail: 201 217 if (fd_dev->fd_file) { 202 218 filp_close(fd_dev->fd_file, NULL); 203 219 fd_dev->fd_file = NULL; 204 220 } 205 - putname(dev_p); 206 221 return ERR_PTR(ret); 207 222 } 208 223 ··· 433 452 token = match_token(ptr, tokens, args); 434 453 switch (token) { 435 454 case Opt_fd_dev_name: 436 - arg_p = match_strdup(&args[0]); 437 - if (!arg_p) { 438 - ret = -ENOMEM; 455 + if 
(match_strlcpy(fd_dev->fd_dev_name, &args[0], 456 + FD_MAX_DEV_NAME) == 0) { 457 + ret = -EINVAL; 439 458 break; 440 459 } 441 - snprintf(fd_dev->fd_dev_name, FD_MAX_DEV_NAME, 442 - "%s", arg_p); 443 - kfree(arg_p); 444 460 pr_debug("FILEIO: Referencing Path: %s\n", 445 461 fd_dev->fd_dev_name); 446 462 fd_dev->fbd_flags |= FBDF_HAS_PATH;

+5 -7

drivers/usb/gadget/storage_common.c

··· 656 656 if (!(filp->f_mode & FMODE_WRITE)) 657 657 ro = 1; 658 658 659 - if (filp->f_path.dentry) 660 - inode = filp->f_path.dentry->d_inode; 661 - if (!inode || (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { 659 + inode = filp->f_path.dentry->d_inode; 660 + if ((!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { 662 661 LINFO(curlun, "invalid file type: %s\n", filename); 663 662 goto out; 664 663 } ··· 666 667 * If we can't read the file, it's no good. 667 668 * If we can't write the file, use it read-only. 668 669 */ 669 - if (!filp->f_op || !(filp->f_op->read || filp->f_op->aio_read)) { 670 + if (!(filp->f_op->read || filp->f_op->aio_read)) { 670 671 LINFO(curlun, "file not readable: %s\n", filename); 671 672 goto out; 672 673 } ··· 711 712 if (fsg_lun_is_open(curlun)) 712 713 fsg_lun_close(curlun); 713 714 714 - get_file(filp); 715 715 curlun->blksize = blksize; 716 716 curlun->blkbits = blkbits; 717 717 curlun->ro = ro; ··· 718 720 curlun->file_length = size; 719 721 curlun->num_sectors = num_sectors; 720 722 LDBG(curlun, "open backing file: %s\n", filename); 721 - rc = 0; 723 + return 0; 722 724 723 725 out: 724 - filp_close(filp, current->files); 726 + fput(filp); 725 727 return rc; 726 728 } 727 729

+3 -3

drivers/usb/gadget/u_uac1.c

··· 275 275 /* Close control device */ 276 276 snd = &gau->control; 277 277 if (snd->filp) 278 - filp_close(snd->filp, current->files); 278 + filp_close(snd->filp, NULL); 279 279 280 280 /* Close PCM playback device and setup substream */ 281 281 snd = &gau->playback; 282 282 if (snd->filp) 283 - filp_close(snd->filp, current->files); 283 + filp_close(snd->filp, NULL); 284 284 285 285 /* Close PCM capture device and setup substream */ 286 286 snd = &gau->capture; 287 287 if (snd->filp) 288 - filp_close(snd->filp, current->files); 288 + filp_close(snd->filp, NULL); 289 289 290 290 return 0; 291 291 }

+2

drivers/video/fb_defio.c

··· 104 104 deferred framebuffer IO. then if userspace touches a page 105 105 again, we repeat the same scheme */ 106 106 107 + file_update_time(vma->vm_file); 108 + 107 109 /* protect against the workqueue changing the page list */ 108 110 mutex_lock(&fbdefio->lock); 109 111

+3

fs/9p/vfs_file.c

··· 610 610 p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", 611 611 page, (unsigned long)filp->private_data); 612 612 613 + /* Update file times before taking page lock */ 614 + file_update_time(filp); 615 + 613 616 v9inode = V9FS_I(inode); 614 617 /* make sure the cache has finished storing the page */ 615 618 v9fs_fscache_wait_on_page_write(inode, page);

-3

fs/btrfs/disk-io.c

··· 1614 1614 struct btrfs_root *root = arg; 1615 1615 1616 1616 do { 1617 - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1618 - 1619 1617 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1620 1618 mutex_trylock(&root->fs_info->cleaner_mutex)) { 1621 1619 btrfs_run_delayed_iputs(root); ··· 1645 1647 do { 1646 1648 cannot_commit = false; 1647 1649 delay = HZ * 30; 1648 - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1649 1650 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1650 1651 1651 1652 spin_lock(&root->fs_info->trans_lock);

+2 -1

fs/btrfs/file.c

··· 1379 1379 ssize_t err = 0; 1380 1380 size_t count, ocount; 1381 1381 1382 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1382 + sb_start_write(inode->i_sb); 1383 1383 1384 1384 mutex_lock(&inode->i_mutex); 1385 1385 ··· 1469 1469 num_written = err; 1470 1470 } 1471 1471 out: 1472 + sb_end_write(inode->i_sb); 1472 1473 current->backing_dev_info = NULL; 1473 1474 return num_written ? num_written : err; 1474 1475 }

+5 -1

fs/btrfs/inode.c

··· 6629 6629 u64 page_start; 6630 6630 u64 page_end; 6631 6631 6632 + sb_start_pagefault(inode->i_sb); 6632 6633 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6633 6634 if (!ret) { 6634 6635 ret = file_update_time(vma->vm_file); ··· 6719 6718 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6720 6719 6721 6720 out_unlock: 6722 - if (!ret) 6721 + if (!ret) { 6722 + sb_end_pagefault(inode->i_sb); 6723 6723 return VM_FAULT_LOCKED; 6724 + } 6724 6725 unlock_page(page); 6725 6726 out: 6726 6727 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 6727 6728 out_noreserve: 6729 + sb_end_pagefault(inode->i_sb); 6728 6730 return ret; 6729 6731 } 6730 6732

+10 -5

fs/btrfs/ioctl.c

··· 195 195 if (!inode_owner_or_capable(inode)) 196 196 return -EACCES; 197 197 198 + ret = mnt_want_write_file(file); 199 + if (ret) 200 + return ret; 201 + 198 202 mutex_lock(&inode->i_mutex); 199 203 200 204 ip_oldflags = ip->flags; ··· 212 208 goto out_unlock; 213 209 } 214 210 } 215 - 216 - ret = mnt_want_write_file(file); 217 - if (ret) 218 - goto out_unlock; 219 211 220 212 if (flags & FS_SYNC_FL) 221 213 ip->flags |= BTRFS_INODE_SYNC; ··· 275 275 inode->i_flags = i_oldflags; 276 276 } 277 277 278 - mnt_drop_write_file(file); 279 278 out_unlock: 280 279 mutex_unlock(&inode->i_mutex); 280 + mnt_drop_write_file(file); 281 281 return ret; 282 282 } 283 283 ··· 664 664 struct dentry *dentry; 665 665 int error; 666 666 667 + error = mnt_want_write(parent->mnt); 668 + if (error) 669 + return error; 670 + 667 671 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 668 672 669 673 dentry = lookup_one_len(name, parent->dentry, namelen); ··· 703 699 dput(dentry); 704 700 out_unlock: 705 701 mutex_unlock(&dir->i_mutex); 702 + mnt_drop_write(parent->mnt); 706 703 return error; 707 704 } 708 705

+7

fs/btrfs/transaction.c

··· 335 335 if (!h) 336 336 return ERR_PTR(-ENOMEM); 337 337 338 + sb_start_intwrite(root->fs_info->sb); 339 + 338 340 if (may_wait_transaction(root, type)) 339 341 wait_current_trans(root); 340 342 ··· 347 345 } while (ret == -EBUSY); 348 346 349 347 if (ret < 0) { 348 + sb_end_intwrite(root->fs_info->sb); 350 349 kmem_cache_free(btrfs_trans_handle_cachep, h); 351 350 return ERR_PTR(ret); 352 351 } ··· 550 547 } 551 548 btrfs_trans_release_metadata(trans, root); 552 549 trans->block_rsv = NULL; 550 + 551 + sb_end_intwrite(root->fs_info->sb); 553 552 554 553 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 555 554 should_end_transaction(trans, root)) { ··· 1582 1577 1583 1578 put_transaction(cur_trans); 1584 1579 put_transaction(cur_trans); 1580 + 1581 + sb_end_intwrite(root->fs_info->sb); 1585 1582 1586 1583 trace_btrfs_transaction_commit(root); 1587 1584

+10 -18

fs/buffer.c

··· 2306 2306 * beyond EOF, then the page is guaranteed safe against truncation until we 2307 2307 * unlock the page. 2308 2308 * 2309 - * Direct callers of this function should call vfs_check_frozen() so that page 2310 - * fault does not busyloop until the fs is thawed. 2309 + * Direct callers of this function should protect against filesystem freezing 2310 + * using sb_start_write() - sb_end_write() functions. 2311 2311 */ 2312 2312 int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 2313 2313 get_block_t get_block) ··· 2317 2317 unsigned long end; 2318 2318 loff_t size; 2319 2319 int ret; 2320 + 2321 + /* 2322 + * Update file times before taking page lock. We may end up failing the 2323 + * fault so this update may be superfluous but who really cares... 2324 + */ 2325 + file_update_time(vma->vm_file); 2320 2326 2321 2327 lock_page(page); 2322 2328 size = i_size_read(inode); ··· 2345 2339 2346 2340 if (unlikely(ret < 0)) 2347 2341 goto out_unlock; 2348 - /* 2349 - * Freezing in progress? We check after the page is marked dirty and 2350 - * with page lock held so if the test here fails, we are sure freezing 2351 - * code will wait during syncing until the page fault is done - at that 2352 - * point page will be dirty and unlocked so freezing code will write it 2353 - * and writeprotect it again. 2354 - */ 2355 2342 set_page_dirty(page); 2356 - if (inode->i_sb->s_frozen != SB_UNFROZEN) { 2357 - ret = -EAGAIN; 2358 - goto out_unlock; 2359 - } 2360 2343 wait_on_page_writeback(page); 2361 2344 return 0; 2362 2345 out_unlock: ··· 2360 2365 int ret; 2361 2366 struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; 2362 2367 2363 - /* 2364 - * This check is racy but catches the common case. The check in 2365 - * __block_page_mkwrite() is reliable. 
2366 - */ 2367 - vfs_check_frozen(sb, SB_FREEZE_WRITE); 2368 + sb_start_pagefault(sb); 2368 2369 ret = __block_page_mkwrite(vma, vmf, get_block); 2370 + sb_end_pagefault(sb); 2369 2371 return block_page_mkwrite_return(ret); 2370 2372 } 2371 2373 EXPORT_SYMBOL(block_page_mkwrite);

+3

fs/ceph/addr.c

··· 1184 1184 loff_t size, len; 1185 1185 int ret; 1186 1186 1187 + /* Update time before taking page lock */ 1188 + file_update_time(vma->vm_file); 1189 + 1187 1190 size = i_size_read(inode); 1188 1191 if (off + PAGE_CACHE_SIZE <= size) 1189 1192 len = PAGE_CACHE_SIZE;

+10 -20

fs/ecryptfs/inode.c

··· 318 318 struct vfsmount *lower_mnt; 319 319 int rc = 0; 320 320 321 - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); 322 - fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); 323 - BUG_ON(!lower_dentry->d_count); 324 - 325 321 dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); 326 - ecryptfs_set_dentry_private(dentry, dentry_info); 327 322 if (!dentry_info) { 328 323 printk(KERN_ERR "%s: Out of memory whilst attempting " 329 324 "to allocate ecryptfs_dentry_info struct\n", 330 325 __func__); 331 326 dput(lower_dentry); 332 - mntput(lower_mnt); 333 - d_drop(dentry); 334 327 return -ENOMEM; 335 328 } 329 + 330 + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); 331 + fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); 332 + BUG_ON(!lower_dentry->d_count); 333 + 334 + ecryptfs_set_dentry_private(dentry, dentry_info); 336 335 ecryptfs_set_dentry_lower(dentry, lower_dentry); 337 336 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); 338 337 ··· 380 381 struct dentry *lower_dir_dentry, *lower_dentry; 381 382 int rc = 0; 382 383 383 - if ((ecryptfs_dentry->d_name.len == 1 384 - && !strcmp(ecryptfs_dentry->d_name.name, ".")) 385 - || (ecryptfs_dentry->d_name.len == 2 386 - && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { 387 - goto out_d_drop; 388 - } 389 384 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 390 385 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 391 386 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, ··· 390 397 rc = PTR_ERR(lower_dentry); 391 398 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 392 399 "[%d] on lower_dentry = [%s]\n", __func__, rc, 393 - encrypted_and_encoded_name); 394 - goto out_d_drop; 400 + ecryptfs_dentry->d_name.name); 401 + goto out; 395 402 } 396 403 if (lower_dentry->d_inode) 397 404 goto interpose; ··· 408 415 if (rc) { 409 416 printk(KERN_ERR "%s: Error attempting to encrypt and 
encode " 410 417 "filename; rc = [%d]\n", __func__, rc); 411 - goto out_d_drop; 418 + goto out; 412 419 } 413 420 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 414 421 lower_dentry = lookup_one_len(encrypted_and_encoded_name, ··· 420 427 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 421 428 "[%d] on lower_dentry = [%s]\n", __func__, rc, 422 429 encrypted_and_encoded_name); 423 - goto out_d_drop; 430 + goto out; 424 431 } 425 432 interpose: 426 433 rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, 427 434 ecryptfs_dir_inode); 428 - goto out; 429 - out_d_drop: 430 - d_drop(ecryptfs_dentry); 431 435 out: 432 436 kfree(encrypted_and_encoded_name); 433 437 return ERR_PTR(rc);

+6 -13

fs/exec.c

··· 2069 2069 */ 2070 2070 static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) 2071 2071 { 2072 - struct file *rp, *wp; 2072 + struct file *files[2]; 2073 2073 struct fdtable *fdt; 2074 2074 struct coredump_params *cp = (struct coredump_params *)info->data; 2075 2075 struct files_struct *cf = current->files; 2076 + int err = create_pipe_files(files, 0); 2077 + if (err) 2078 + return err; 2076 2079 2077 - wp = create_write_pipe(0); 2078 - if (IS_ERR(wp)) 2079 - return PTR_ERR(wp); 2080 - 2081 - rp = create_read_pipe(wp, 0); 2082 - if (IS_ERR(rp)) { 2083 - free_write_pipe(wp); 2084 - return PTR_ERR(rp); 2085 - } 2086 - 2087 - cp->file = wp; 2080 + cp->file = files[1]; 2088 2081 2089 2082 sys_close(0); 2090 - fd_install(0, rp); 2083 + fd_install(0, files[0]); 2091 2084 spin_lock(&cf->file_lock); 2092 2085 fdt = files_fdtable(cf); 2093 2086 __set_open_fd(0, fdt);

+4 -1

fs/ext2/inode.c

··· 79 79 truncate_inode_pages(&inode->i_data, 0); 80 80 81 81 if (want_delete) { 82 + sb_start_intwrite(inode->i_sb); 82 83 /* set dtime */ 83 84 EXT2_I(inode)->i_dtime = get_seconds(); 84 85 mark_inode_dirty(inode); ··· 99 98 if (unlikely(rsv)) 100 99 kfree(rsv); 101 100 102 - if (want_delete) 101 + if (want_delete) { 103 102 ext2_free_inode(inode); 103 + sb_end_intwrite(inode->i_sb); 104 + } 104 105 } 105 106 106 107 typedef struct {

+33

fs/ext2/super.c

··· 42 42 static int ext2_remount (struct super_block * sb, int * flags, char * data); 43 43 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); 44 44 static int ext2_sync_fs(struct super_block *sb, int wait); 45 + static int ext2_freeze(struct super_block *sb); 46 + static int ext2_unfreeze(struct super_block *sb); 45 47 46 48 void ext2_error(struct super_block *sb, const char *function, 47 49 const char *fmt, ...) ··· 307 305 .evict_inode = ext2_evict_inode, 308 306 .put_super = ext2_put_super, 309 307 .sync_fs = ext2_sync_fs, 308 + .freeze_fs = ext2_freeze, 309 + .unfreeze_fs = ext2_unfreeze, 310 310 .statfs = ext2_statfs, 311 311 .remount_fs = ext2_remount, 312 312 .show_options = ext2_show_options, ··· 1204 1200 return 0; 1205 1201 } 1206 1202 1203 + static int ext2_freeze(struct super_block *sb) 1204 + { 1205 + struct ext2_sb_info *sbi = EXT2_SB(sb); 1206 + 1207 + /* 1208 + * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared 1209 + * because we have unattached inodes and thus filesystem is not fully 1210 + * consistent. 1211 + */ 1212 + if (atomic_long_read(&sb->s_remove_count)) { 1213 + ext2_sync_fs(sb, 1); 1214 + return 0; 1215 + } 1216 + /* Set EXT2_VALID_FS flag */ 1217 + spin_lock(&sbi->s_lock); 1218 + sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state); 1219 + spin_unlock(&sbi->s_lock); 1220 + ext2_sync_super(sb, sbi->s_es, 1); 1221 + 1222 + return 0; 1223 + } 1224 + 1225 + static int ext2_unfreeze(struct super_block *sb) 1226 + { 1227 + /* Just write sb to clear EXT2_VALID_FS flag */ 1228 + ext2_write_super(sb); 1229 + 1230 + return 0; 1231 + } 1207 1232 1208 1233 void ext2_write_super(struct super_block *sb) 1209 1234 {

+10 -5

fs/ext4/inode.c

··· 233 233 if (is_bad_inode(inode)) 234 234 goto no_delete; 235 235 236 + /* 237 + * Protect us against freezing - iput() caller didn't have to have any 238 + * protection against it 239 + */ 240 + sb_start_intwrite(inode->i_sb); 236 241 handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); 237 242 if (IS_ERR(handle)) { 238 243 ext4_std_error(inode->i_sb, PTR_ERR(handle)); ··· 247 242 * cleaned up. 248 243 */ 249 244 ext4_orphan_del(NULL, inode); 245 + sb_end_intwrite(inode->i_sb); 250 246 goto no_delete; 251 247 } 252 248 ··· 279 273 stop_handle: 280 274 ext4_journal_stop(handle); 281 275 ext4_orphan_del(NULL, inode); 276 + sb_end_intwrite(inode->i_sb); 282 277 goto no_delete; 283 278 } 284 279 } ··· 308 301 else 309 302 ext4_free_inode(handle, inode); 310 303 ext4_journal_stop(handle); 304 + sb_end_intwrite(inode->i_sb); 311 305 return; 312 306 no_delete: 313 307 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ ··· 4787 4779 get_block_t *get_block; 4788 4780 int retries = 0; 4789 4781 4790 - /* 4791 - * This check is racy but catches the common case. We rely on 4792 - * __block_page_mkwrite() to do a reliable check. 4793 - */ 4794 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 4782 + sb_start_pagefault(inode->i_sb); 4795 4783 /* Delalloc case is easy... */ 4796 4784 if (test_opt(inode->i_sb, DELALLOC) && 4797 4785 !ext4_should_journal_data(inode) && ··· 4855 4851 out_ret: 4856 4852 ret = block_page_mkwrite_return(ret); 4857 4853 out: 4854 + sb_end_pagefault(inode->i_sb); 4858 4855 return ret; 4859 4856 }

+6

fs/ext4/mmp.c

··· 44 44 { 45 45 struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); 46 46 47 + /* 48 + * We protect against freezing so that we don't create dirty buffers 49 + * on frozen filesystem. 50 + */ 51 + sb_start_write(sb); 47 52 ext4_mmp_csum_set(sb, mmp); 48 53 mark_buffer_dirty(bh); 49 54 lock_buffer(bh); ··· 56 51 get_bh(bh); 57 52 submit_bh(WRITE_SYNC, bh); 58 53 wait_on_buffer(bh); 54 + sb_end_write(sb); 59 55 if (unlikely(!buffer_uptodate(bh))) 60 56 return 1; 61 57

+7 -24

fs/ext4/super.c

··· 331 331 * journal_end calls result in the superblock being marked dirty, so 332 332 * that sync() will call the filesystem's write_super callback if 333 333 * appropriate. 334 - * 335 - * To avoid j_barrier hold in userspace when a user calls freeze(), 336 - * ext4 prevents a new handle from being started by s_frozen, which 337 - * is in an upper layer. 338 334 */ 339 335 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) 340 336 { 341 337 journal_t *journal; 342 - handle_t *handle; 343 338 344 339 trace_ext4_journal_start(sb, nblocks, _RET_IP_); 345 340 if (sb->s_flags & MS_RDONLY) 346 341 return ERR_PTR(-EROFS); 347 342 343 + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); 348 344 journal = EXT4_SB(sb)->s_journal; 349 - handle = ext4_journal_current_handle(); 350 - 351 - /* 352 - * If a handle has been started, it should be allowed to 353 - * finish, otherwise deadlock could happen between freeze 354 - * and others(e.g. truncate) due to the restart of the 355 - * journal handle if the filesystem is forzen and active 356 - * handles are not stopped. 
357 - */ 358 - if (!handle) 359 - vfs_check_frozen(sb, SB_FREEZE_TRANS); 360 - 361 345 if (!journal) 362 346 return ext4_get_nojournal(); 363 347 /* ··· 2731 2747 sb = elr->lr_super; 2732 2748 ngroups = EXT4_SB(sb)->s_groups_count; 2733 2749 2750 + sb_start_write(sb); 2734 2751 for (group = elr->lr_next_group; group < ngroups; group++) { 2735 2752 gdp = ext4_get_group_desc(sb, group, NULL); 2736 2753 if (!gdp) { ··· 2758 2773 elr->lr_next_sched = jiffies + elr->lr_timeout; 2759 2774 elr->lr_next_group = group + 1; 2760 2775 } 2776 + sb_end_write(sb); 2761 2777 2762 2778 return ret; 2763 2779 } ··· 4446 4460 return 0; 4447 4461 4448 4462 journal = EXT4_SB(sb)->s_journal; 4449 - if (journal) { 4450 - vfs_check_frozen(sb, SB_FREEZE_TRANS); 4463 + if (journal) 4451 4464 ret = ext4_journal_force_commit(journal); 4452 - } 4453 4465 4454 4466 return ret; 4455 4467 } ··· 4477 4493 * gives us a chance to flush the journal completely and mark the fs clean. 4478 4494 * 4479 4495 * Note that only this function cannot bring a filesystem to be in a clean 4480 - * state independently, because ext4 prevents a new handle from being started 4481 - * by @sb->s_frozen, which stays in an upper layer. It thus needs help from 4482 - * the upper layer. 4496 + * state independently. It relies on upper layer to stop all data & metadata 4497 + * modifications. 4483 4498 */ 4484 4499 static int ext4_freeze(struct super_block *sb) 4485 4500 { ··· 4505 4522 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 4506 4523 error = ext4_commit_super(sb, 1); 4507 4524 out: 4508 - /* we rely on s_frozen to stop further updates */ 4525 + /* we rely on upper layer to stop further updates */ 4509 4526 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4510 4527 return error; 4511 4528 }

+7 -8

fs/fat/file.c

··· 43 43 if (err) 44 44 goto out; 45 45 46 - mutex_lock(&inode->i_mutex); 47 46 err = mnt_want_write_file(file); 48 47 if (err) 49 - goto out_unlock_inode; 48 + goto out; 49 + mutex_lock(&inode->i_mutex); 50 50 51 51 /* 52 52 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also ··· 73 73 /* The root directory has no attributes */ 74 74 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { 75 75 err = -EINVAL; 76 - goto out_drop_write; 76 + goto out_unlock_inode; 77 77 } 78 78 79 79 if (sbi->options.sys_immutable && 80 80 ((attr | oldattr) & ATTR_SYS) && 81 81 !capable(CAP_LINUX_IMMUTABLE)) { 82 82 err = -EPERM; 83 - goto out_drop_write; 83 + goto out_unlock_inode; 84 84 } 85 85 86 86 /* ··· 90 90 */ 91 91 err = security_inode_setattr(file->f_path.dentry, &ia); 92 92 if (err) 93 - goto out_drop_write; 93 + goto out_unlock_inode; 94 94 95 95 /* This MUST be done before doing anything irreversible... */ 96 96 err = fat_setattr(file->f_path.dentry, &ia); 97 97 if (err) 98 - goto out_drop_write; 98 + goto out_unlock_inode; 99 99 100 100 fsnotify_change(file->f_path.dentry, ia.ia_valid); 101 101 if (sbi->options.sys_immutable) { ··· 107 107 108 108 fat_save_attrs(inode, attr); 109 109 mark_inode_dirty(inode); 110 - out_drop_write: 111 - mnt_drop_write_file(file); 112 110 out_unlock_inode: 113 111 mutex_unlock(&inode->i_mutex); 112 + mnt_drop_write_file(file); 114 113 out: 115 114 return err; 116 115 }

+2 -2

fs/file_table.c

··· 43 43 44 44 static struct percpu_counter nr_files __cacheline_aligned_in_smp; 45 45 46 - static inline void file_free_rcu(struct rcu_head *head) 46 + static void file_free_rcu(struct rcu_head *head) 47 47 { 48 48 struct file *f = container_of(head, struct file, f_u.fu_rcuhead); 49 49 ··· 217 217 return; 218 218 if (file_check_writeable(file) != 0) 219 219 return; 220 - mnt_drop_write(mnt); 220 + __mnt_drop_write(mnt); 221 221 file_release_write(file); 222 222 } 223 223

+2 -2

fs/fuse/file.c

··· 944 944 return err; 945 945 946 946 count = ocount; 947 - 947 + sb_start_write(inode->i_sb); 948 948 mutex_lock(&inode->i_mutex); 949 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 950 949 951 950 /* We can write back this queue in page reclaim */ 952 951 current->backing_dev_info = mapping->backing_dev_info; ··· 1003 1004 out: 1004 1005 current->backing_dev_info = NULL; 1005 1006 mutex_unlock(&inode->i_mutex); 1007 + sb_end_write(inode->i_sb); 1006 1008 1007 1009 return written ? written : err; 1008 1010 }

+6 -12

fs/gfs2/file.c

··· 373 373 loff_t size; 374 374 int ret; 375 375 376 - /* Wait if fs is frozen. This is racy so we check again later on 377 - * and retry if the fs has been frozen after the page lock has 378 - * been acquired 379 - */ 380 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 376 + sb_start_pagefault(inode->i_sb); 377 + 378 + /* Update file times before taking page lock */ 379 + file_update_time(vma->vm_file); 381 380 382 381 ret = gfs2_rs_alloc(ip); 383 382 if (ret) ··· 461 462 gfs2_holder_uninit(&gh); 462 463 if (ret == 0) { 463 464 set_page_dirty(page); 464 - /* This check must be post dropping of transaction lock */ 465 - if (inode->i_sb->s_frozen == SB_UNFROZEN) { 466 - wait_on_page_writeback(page); 467 - } else { 468 - ret = -EAGAIN; 469 - unlock_page(page); 470 - } 465 + wait_on_page_writeback(page); 471 466 } 467 + sb_end_pagefault(inode->i_sb); 472 468 return block_page_mkwrite_return(ret); 473 469 } 474 470

+4

fs/gfs2/trans.c

··· 50 50 if (revokes) 51 51 tr->tr_reserved += gfs2_struct2blk(sdp, revokes, 52 52 sizeof(u64)); 53 + sb_start_intwrite(sdp->sd_vfs); 53 54 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); 54 55 55 56 error = gfs2_glock_nq(&tr->tr_t_gh); ··· 69 68 gfs2_glock_dq(&tr->tr_t_gh); 70 69 71 70 fail_holder_uninit: 71 + sb_end_intwrite(sdp->sd_vfs); 72 72 gfs2_holder_uninit(&tr->tr_t_gh); 73 73 kfree(tr); 74 74 ··· 118 116 gfs2_holder_uninit(&tr->tr_t_gh); 119 117 kfree(tr); 120 118 } 119 + sb_end_intwrite(sdp->sd_vfs); 121 120 return; 122 121 } 123 122 ··· 139 136 140 137 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) 141 138 gfs2_log_flush(sdp, NULL); 139 + sb_end_intwrite(sdp->sd_vfs); 142 140 } 143 141 144 142 /**

+8 -4

fs/inode.c

··· 1542 1542 if (timespec_equal(&inode->i_atime, &now)) 1543 1543 return; 1544 1544 1545 - if (mnt_want_write(mnt)) 1545 + if (!sb_start_write_trylock(inode->i_sb)) 1546 1546 return; 1547 1547 1548 + if (__mnt_want_write(mnt)) 1549 + goto skip_update; 1548 1550 /* 1549 1551 * File systems can error out when updating inodes if they need to 1550 1552 * allocate new space to modify an inode (such is the case for ··· 1557 1555 * of the fs read only, e.g. subvolumes in Btrfs. 1558 1556 */ 1559 1557 update_time(inode, &now, S_ATIME); 1560 - mnt_drop_write(mnt); 1558 + __mnt_drop_write(mnt); 1559 + skip_update: 1560 + sb_end_write(inode->i_sb); 1561 1561 } 1562 1562 EXPORT_SYMBOL(touch_atime); 1563 1563 ··· 1666 1662 return 0; 1667 1663 1668 1664 /* Finally allowed to write? Takes lock. */ 1669 - if (mnt_want_write_file(file)) 1665 + if (__mnt_want_write_file(file)) 1670 1666 return 0; 1671 1667 1672 1668 ret = update_time(inode, &now, sync_it); 1673 - mnt_drop_write_file(file); 1669 + __mnt_drop_write_file(file); 1674 1670 1675 1671 return ret; 1676 1672 }

+4

fs/internal.h

··· 61 61 62 62 extern struct lglock vfsmount_lock; 63 63 64 + extern int __mnt_want_write(struct vfsmount *); 65 + extern int __mnt_want_write_file(struct file *); 66 + extern void __mnt_drop_write(struct vfsmount *); 67 + extern void __mnt_drop_write_file(struct file *); 64 68 65 69 /* 66 70 * fs_struct.c

+7 -7

fs/lockd/clntproc.c

··· 156 156 struct nlm_rqst *call; 157 157 int status; 158 158 159 - nlm_get_host(host); 160 159 call = nlm_alloc_call(host); 161 160 if (call == NULL) 162 161 return -ENOMEM; 163 162 164 163 nlmclnt_locks_init_private(fl, host); 164 + if (!fl->fl_u.nfs_fl.owner) { 165 + /* lockowner allocation has failed */ 166 + nlmclnt_release_call(call); 167 + return -ENOMEM; 168 + } 165 169 /* Set up the argument struct */ 166 170 nlmclnt_setlockargs(call, fl); 167 171 ··· 189 185 190 186 /* 191 187 * Allocate an NLM RPC call struct 192 - * 193 - * Note: the caller must hold a reference to host. In case of failure, 194 - * this reference will be released. 195 188 */ 196 189 struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) 197 190 { ··· 200 199 atomic_set(&call->a_count, 1); 201 200 locks_init_lock(&call->a_args.lock.fl); 202 201 locks_init_lock(&call->a_res.lock.fl); 203 - call->a_host = host; 202 + call->a_host = nlm_get_host(host); 204 203 return call; 205 204 } 206 205 if (signalled()) ··· 208 207 printk("nlm_alloc_call: failed, waiting for memory\n"); 209 208 schedule_timeout_interruptible(5*HZ); 210 209 } 211 - nlmclnt_release_host(host); 212 210 return NULL; 213 211 } 214 212 ··· 750 750 dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" 751 751 " Attempting to cancel lock.\n"); 752 752 753 - req = nlm_alloc_call(nlm_get_host(host)); 753 + req = nlm_alloc_call(host); 754 754 if (!req) 755 755 return -ENOMEM; 756 756 req->a_flags = RPC_TASK_ASYNC;

+1

fs/lockd/svc4proc.c

··· 257 257 return rpc_system_err; 258 258 259 259 call = nlm_alloc_call(host); 260 + nlmsvc_release_host(host); 260 261 if (call == NULL) 261 262 return rpc_system_err; 262 263

-1

fs/lockd/svclock.c

··· 219 219 struct nlm_block *block; 220 220 struct nlm_rqst *call = NULL; 221 221 222 - nlm_get_host(host); 223 222 call = nlm_alloc_call(host); 224 223 if (call == NULL) 225 224 return NULL;

+1

fs/lockd/svcproc.c

··· 297 297 return rpc_system_err; 298 298 299 299 call = nlm_alloc_call(host); 300 + nlmsvc_release_host(host); 300 301 if (call == NULL) 301 302 return rpc_system_err; 302 303

+213 -100

fs/namei.c

··· 650 650 path_put(link); 651 651 } 652 652 653 + int sysctl_protected_symlinks __read_mostly = 1; 654 + int sysctl_protected_hardlinks __read_mostly = 1; 655 + 656 + /** 657 + * may_follow_link - Check symlink following for unsafe situations 658 + * @link: The path of the symlink 659 + * 660 + * In the case of the sysctl_protected_symlinks sysctl being enabled, 661 + * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is 662 + * in a sticky world-writable directory. This is to protect privileged 663 + * processes from failing races against path names that may change out 664 + * from under them by way of other users creating malicious symlinks. 665 + * It will permit symlinks to be followed only when outside a sticky 666 + * world-writable directory, or when the uid of the symlink and follower 667 + * match, or when the directory owner matches the symlink's owner. 668 + * 669 + * Returns 0 if following the symlink is allowed, -ve on error. 670 + */ 671 + static inline int may_follow_link(struct path *link, struct nameidata *nd) 672 + { 673 + const struct inode *inode; 674 + const struct inode *parent; 675 + 676 + if (!sysctl_protected_symlinks) 677 + return 0; 678 + 679 + /* Allowed if owner and follower match. */ 680 + inode = link->dentry->d_inode; 681 + if (current_cred()->fsuid == inode->i_uid) 682 + return 0; 683 + 684 + /* Allowed if parent directory not sticky and world-writable. */ 685 + parent = nd->path.dentry->d_inode; 686 + if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) 687 + return 0; 688 + 689 + /* Allowed if parent directory and link owner match. 
*/ 690 + if (parent->i_uid == inode->i_uid) 691 + return 0; 692 + 693 + path_put_conditional(link, nd); 694 + path_put(&nd->path); 695 + audit_log_link_denied("follow_link", link); 696 + return -EACCES; 697 + } 698 + 699 + /** 700 + * safe_hardlink_source - Check for safe hardlink conditions 701 + * @inode: the source inode to hardlink from 702 + * 703 + * Return false if at least one of the following conditions: 704 + * - inode is not a regular file 705 + * - inode is setuid 706 + * - inode is setgid and group-exec 707 + * - access failure for read and write 708 + * 709 + * Otherwise returns true. 710 + */ 711 + static bool safe_hardlink_source(struct inode *inode) 712 + { 713 + umode_t mode = inode->i_mode; 714 + 715 + /* Special files should not get pinned to the filesystem. */ 716 + if (!S_ISREG(mode)) 717 + return false; 718 + 719 + /* Setuid files should not get pinned to the filesystem. */ 720 + if (mode & S_ISUID) 721 + return false; 722 + 723 + /* Executable setgid files should not get pinned to the filesystem. */ 724 + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) 725 + return false; 726 + 727 + /* Hardlinking to unreadable or unwritable sources is dangerous. */ 728 + if (inode_permission(inode, MAY_READ | MAY_WRITE)) 729 + return false; 730 + 731 + return true; 732 + } 733 + 734 + /** 735 + * may_linkat - Check permissions for creating a hardlink 736 + * @link: the source to hardlink from 737 + * 738 + * Block hardlink when all of: 739 + * - sysctl_protected_hardlinks enabled 740 + * - fsuid does not match inode 741 + * - hardlink source is unsafe (see safe_hardlink_source() above) 742 + * - not CAP_FOWNER 743 + * 744 + * Returns 0 if successful, -ve on error. 
745 + */ 746 + static int may_linkat(struct path *link) 747 + { 748 + const struct cred *cred; 749 + struct inode *inode; 750 + 751 + if (!sysctl_protected_hardlinks) 752 + return 0; 753 + 754 + cred = current_cred(); 755 + inode = link->dentry->d_inode; 756 + 757 + /* Source inode owner (or CAP_FOWNER) can hardlink all they like, 758 + * otherwise, it must be a safe source. 759 + */ 760 + if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) || 761 + capable(CAP_FOWNER)) 762 + return 0; 763 + 764 + audit_log_link_denied("linkat", link); 765 + return -EPERM; 766 + } 767 + 653 768 static __always_inline int 654 769 follow_link(struct path *link, struct nameidata *nd, void **p) 655 770 { ··· 1933 1818 while (err > 0) { 1934 1819 void *cookie; 1935 1820 struct path link = path; 1821 + err = may_follow_link(&link, nd); 1822 + if (unlikely(err)) 1823 + break; 1936 1824 nd->flags |= LOOKUP_PARENT; 1937 1825 err = follow_link(&link, nd, &cookie); 1938 1826 if (err) ··· 2395 2277 static int atomic_open(struct nameidata *nd, struct dentry *dentry, 2396 2278 struct path *path, struct file *file, 2397 2279 const struct open_flags *op, 2398 - bool *want_write, bool need_lookup, 2280 + bool got_write, bool need_lookup, 2399 2281 int *opened) 2400 2282 { 2401 2283 struct inode *dir = nd->path.dentry->d_inode; ··· 2418 2300 if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) 2419 2301 mode &= ~current_umask(); 2420 2302 2421 - if (open_flag & O_EXCL) { 2303 + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { 2422 2304 open_flag &= ~O_TRUNC; 2423 2305 *opened |= FILE_CREATED; 2424 2306 } ··· 2432 2314 * Another problem is returing the "right" error value (e.g. for an 2433 2315 * O_EXCL open we want to return EEXIST not EROFS). 
2434 2316 */ 2435 - if ((open_flag & (O_CREAT | O_TRUNC)) || 2436 - (open_flag & O_ACCMODE) != O_RDONLY) { 2437 - error = mnt_want_write(nd->path.mnt); 2438 - if (!error) { 2439 - *want_write = true; 2440 - } else if (!(open_flag & O_CREAT)) { 2317 + if (((open_flag & (O_CREAT | O_TRUNC)) || 2318 + (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) { 2319 + if (!(open_flag & O_CREAT)) { 2441 2320 /* 2442 2321 * No O_CREATE -> atomicity not a requirement -> fall 2443 2322 * back to lookup + open ··· 2442 2327 goto no_open; 2443 2328 } else if (open_flag & (O_EXCL | O_TRUNC)) { 2444 2329 /* Fall back and fail with the right error */ 2445 - create_error = error; 2330 + create_error = -EROFS; 2446 2331 goto no_open; 2447 2332 } else { 2448 2333 /* No side effects, safe to clear O_CREAT */ 2449 - create_error = error; 2334 + create_error = -EROFS; 2450 2335 open_flag &= ~O_CREAT; 2451 2336 } 2452 2337 } ··· 2553 2438 static int lookup_open(struct nameidata *nd, struct path *path, 2554 2439 struct file *file, 2555 2440 const struct open_flags *op, 2556 - bool *want_write, int *opened) 2441 + bool got_write, int *opened) 2557 2442 { 2558 2443 struct dentry *dir = nd->path.dentry; 2559 2444 struct inode *dir_inode = dir->d_inode; ··· 2571 2456 goto out_no_open; 2572 2457 2573 2458 if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { 2574 - return atomic_open(nd, dentry, path, file, op, want_write, 2459 + return atomic_open(nd, dentry, path, file, op, got_write, 2575 2460 need_lookup, opened); 2576 2461 } 2577 2462 ··· 2595 2480 * a permanent write count is taken through 2596 2481 * the 'struct file' in finish_open(). 
2597 2482 */ 2598 - error = mnt_want_write(nd->path.mnt); 2599 - if (error) 2483 + if (!got_write) { 2484 + error = -EROFS; 2600 2485 goto out_dput; 2601 - *want_write = true; 2486 + } 2602 2487 *opened |= FILE_CREATED; 2603 2488 error = security_path_mknod(&nd->path, dentry, mode, 0); 2604 2489 if (error) ··· 2628 2513 struct dentry *dir = nd->path.dentry; 2629 2514 int open_flag = op->open_flag; 2630 2515 bool will_truncate = (open_flag & O_TRUNC) != 0; 2631 - bool want_write = false; 2516 + bool got_write = false; 2632 2517 int acc_mode = op->acc_mode; 2633 2518 struct inode *inode; 2634 2519 bool symlink_ok = false; ··· 2697 2582 } 2698 2583 2699 2584 retry_lookup: 2585 + if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { 2586 + error = mnt_want_write(nd->path.mnt); 2587 + if (!error) 2588 + got_write = true; 2589 + /* 2590 + * do _not_ fail yet - we might not need that or fail with 2591 + * a different error; let lookup_open() decide; we'll be 2592 + * dropping this one anyway. 2593 + */ 2594 + } 2700 2595 mutex_lock(&dir->d_inode->i_mutex); 2701 - error = lookup_open(nd, path, file, op, &want_write, opened); 2596 + error = lookup_open(nd, path, file, op, got_write, opened); 2702 2597 mutex_unlock(&dir->d_inode->i_mutex); 2703 2598 2704 2599 if (error <= 0) { ··· 2733 2608 } 2734 2609 2735 2610 /* 2736 - * It already exists. 2611 + * create/update audit record if it already exists. 2737 2612 */ 2738 - audit_inode(pathname, path->dentry); 2613 + if (path->dentry->d_inode) 2614 + audit_inode(pathname, path->dentry); 2739 2615 2740 2616 /* 2741 2617 * If atomic_open() acquired write access it is dropped now due to 2742 2618 * possible mount and symlink following (this might be optimized away if 2743 2619 * necessary...) 
2744 2620 */ 2745 - if (want_write) { 2621 + if (got_write) { 2746 2622 mnt_drop_write(nd->path.mnt); 2747 - want_write = false; 2623 + got_write = false; 2748 2624 } 2749 2625 2750 2626 error = -EEXIST; 2751 - if (open_flag & O_EXCL) 2627 + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) 2752 2628 goto exit_dput; 2753 2629 2754 2630 error = follow_managed(path, nd->flags); ··· 2810 2684 error = mnt_want_write(nd->path.mnt); 2811 2685 if (error) 2812 2686 goto out; 2813 - want_write = true; 2687 + got_write = true; 2814 2688 } 2815 2689 finish_open_created: 2816 2690 error = may_open(&nd->path, acc_mode, open_flag); ··· 2837 2711 goto exit_fput; 2838 2712 } 2839 2713 out: 2840 - if (want_write) 2714 + if (got_write) 2841 2715 mnt_drop_write(nd->path.mnt); 2842 2716 path_put(&save_parent); 2843 2717 terminate_walk(nd); ··· 2861 2735 nd->inode = dir->d_inode; 2862 2736 save_parent.mnt = NULL; 2863 2737 save_parent.dentry = NULL; 2864 - if (want_write) { 2738 + if (got_write) { 2865 2739 mnt_drop_write(nd->path.mnt); 2866 - want_write = false; 2740 + got_write = false; 2867 2741 } 2868 2742 retried = true; 2869 2743 goto retry_lookup; ··· 2903 2777 error = -ELOOP; 2904 2778 break; 2905 2779 } 2780 + error = may_follow_link(&link, nd); 2781 + if (unlikely(error)) 2782 + break; 2906 2783 nd->flags |= LOOKUP_PARENT; 2907 2784 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); 2908 2785 error = follow_link(&link, nd, &cookie); ··· 2975 2846 { 2976 2847 struct dentry *dentry = ERR_PTR(-EEXIST); 2977 2848 struct nameidata nd; 2849 + int err2; 2978 2850 int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 2979 2851 if (error) 2980 2852 return ERR_PTR(error); ··· 2989 2859 nd.flags &= ~LOOKUP_PARENT; 2990 2860 nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; 2991 2861 2862 + /* don't fail immediately if it's r/o, at least try to report other errors */ 2863 + err2 = mnt_want_write(nd.path.mnt); 2992 2864 /* 2993 2865 * Do the final lookup. 
2994 2866 */ 2995 2867 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2996 2868 dentry = lookup_hash(&nd); 2997 2869 if (IS_ERR(dentry)) 2998 - goto fail; 2870 + goto unlock; 2999 2871 2872 + error = -EEXIST; 3000 2873 if (dentry->d_inode) 3001 - goto eexist; 2874 + goto fail; 3002 2875 /* 3003 2876 * Special case - lookup gave negative, but... we had foo/bar/ 3004 2877 * From the vfs_mknod() POV we just have a negative dentry - ··· 3009 2876 * been asking for (non-existent) directory. -ENOENT for you. 3010 2877 */ 3011 2878 if (unlikely(!is_dir && nd.last.name[nd.last.len])) { 3012 - dput(dentry); 3013 - dentry = ERR_PTR(-ENOENT); 2879 + error = -ENOENT; 2880 + goto fail; 2881 + } 2882 + if (unlikely(err2)) { 2883 + error = err2; 3014 2884 goto fail; 3015 2885 } 3016 2886 *path = nd.path; 3017 2887 return dentry; 3018 - eexist: 3019 - dput(dentry); 3020 - dentry = ERR_PTR(-EEXIST); 3021 2888 fail: 2889 + dput(dentry); 2890 + dentry = ERR_PTR(error); 2891 + unlock: 3022 2892 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2893 + if (!err2) 2894 + mnt_drop_write(nd.path.mnt); 3023 2895 out: 3024 2896 path_put(&nd.path); 3025 2897 return dentry; 3026 2898 } 3027 2899 EXPORT_SYMBOL(kern_path_create); 2900 + 2901 + void done_path_create(struct path *path, struct dentry *dentry) 2902 + { 2903 + dput(dentry); 2904 + mutex_unlock(&path->dentry->d_inode->i_mutex); 2905 + mnt_drop_write(path->mnt); 2906 + path_put(path); 2907 + } 2908 + EXPORT_SYMBOL(done_path_create); 3028 2909 3029 2910 struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) 3030 2911 { ··· 3103 2956 struct path path; 3104 2957 int error; 3105 2958 3106 - if (S_ISDIR(mode)) 3107 - return -EPERM; 2959 + error = may_mknod(mode); 2960 + if (error) 2961 + return error; 3108 2962 3109 2963 dentry = user_path_create(dfd, filename, &path, 0); 3110 2964 if (IS_ERR(dentry)) ··· 3113 2965 3114 2966 if (!IS_POSIXACL(path.dentry->d_inode)) 3115 2967 
mode &= ~current_umask(); 3116 - error = may_mknod(mode); 3117 - if (error) 3118 - goto out_dput; 3119 - error = mnt_want_write(path.mnt); 3120 - if (error) 3121 - goto out_dput; 3122 2968 error = security_path_mknod(&path, dentry, mode, dev); 3123 2969 if (error) 3124 - goto out_drop_write; 2970 + goto out; 3125 2971 switch (mode & S_IFMT) { 3126 2972 case 0: case S_IFREG: 3127 2973 error = vfs_create(path.dentry->d_inode,dentry,mode,true); ··· 3128 2986 error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); 3129 2987 break; 3130 2988 } 3131 - out_drop_write: 3132 - mnt_drop_write(path.mnt); 3133 - out_dput: 3134 - dput(dentry); 3135 - mutex_unlock(&path.dentry->d_inode->i_mutex); 3136 - path_put(&path); 3137 - 2989 + out: 2990 + done_path_create(&path, dentry); 3138 2991 return error; 3139 2992 } 3140 2993 ··· 3175 3038 3176 3039 if (!IS_POSIXACL(path.dentry->d_inode)) 3177 3040 mode &= ~current_umask(); 3178 - error = mnt_want_write(path.mnt); 3179 - if (error) 3180 - goto out_dput; 3181 3041 error = security_path_mkdir(&path, dentry, mode); 3182 - if (error) 3183 - goto out_drop_write; 3184 - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); 3185 - out_drop_write: 3186 - mnt_drop_write(path.mnt); 3187 - out_dput: 3188 - dput(dentry); 3189 - mutex_unlock(&path.dentry->d_inode->i_mutex); 3190 - path_put(&path); 3042 + if (!error) 3043 + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); 3044 + done_path_create(&path, dentry); 3191 3045 return error; 3192 3046 } 3193 3047 ··· 3272 3144 } 3273 3145 3274 3146 nd.flags &= ~LOOKUP_PARENT; 3147 + error = mnt_want_write(nd.path.mnt); 3148 + if (error) 3149 + goto exit1; 3275 3150 3276 3151 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 3277 3152 dentry = lookup_hash(&nd); ··· 3285 3154 error = -ENOENT; 3286 3155 goto exit3; 3287 3156 } 3288 - error = mnt_want_write(nd.path.mnt); 3289 - if (error) 3290 - goto exit3; 3291 3157 error = security_path_rmdir(&nd.path, dentry); 3292 3158 if 
(error) 3293 - goto exit4; 3159 + goto exit3; 3294 3160 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 3295 - exit4: 3296 - mnt_drop_write(nd.path.mnt); 3297 3161 exit3: 3298 3162 dput(dentry); 3299 3163 exit2: 3300 3164 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 3165 + mnt_drop_write(nd.path.mnt); 3301 3166 exit1: 3302 3167 path_put(&nd.path); 3303 3168 putname(name); ··· 3360 3233 goto exit1; 3361 3234 3362 3235 nd.flags &= ~LOOKUP_PARENT; 3236 + error = mnt_want_write(nd.path.mnt); 3237 + if (error) 3238 + goto exit1; 3363 3239 3364 3240 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 3365 3241 dentry = lookup_hash(&nd); ··· 3375 3245 if (!inode) 3376 3246 goto slashes; 3377 3247 ihold(inode); 3378 - error = mnt_want_write(nd.path.mnt); 3379 - if (error) 3380 - goto exit2; 3381 3248 error = security_path_unlink(&nd.path, dentry); 3382 3249 if (error) 3383 - goto exit3; 3250 + goto exit2; 3384 3251 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 3385 - exit3: 3386 - mnt_drop_write(nd.path.mnt); 3387 - exit2: 3252 + exit2: 3388 3253 dput(dentry); 3389 3254 } 3390 3255 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 3391 3256 if (inode) 3392 3257 iput(inode); /* truncate the inode here */ 3258 + mnt_drop_write(nd.path.mnt); 3393 3259 exit1: 3394 3260 path_put(&nd.path); 3395 3261 putname(name); ··· 3450 3324 if (IS_ERR(dentry)) 3451 3325 goto out_putname; 3452 3326 3453 - error = mnt_want_write(path.mnt); 3454 - if (error) 3455 - goto out_dput; 3456 3327 error = security_path_symlink(&path, dentry, from); 3457 - if (error) 3458 - goto out_drop_write; 3459 - error = vfs_symlink(path.dentry->d_inode, dentry, from); 3460 - out_drop_write: 3461 - mnt_drop_write(path.mnt); 3462 - out_dput: 3463 - dput(dentry); 3464 - mutex_unlock(&path.dentry->d_inode->i_mutex); 3465 - path_put(&path); 3328 + if (!error) 3329 + error = vfs_symlink(path.dentry->d_inode, dentry, from); 3330 + done_path_create(&path, dentry); 3466 3331 out_putname: 
3467 3332 putname(from); 3468 3333 return error; ··· 3553 3436 error = -EXDEV; 3554 3437 if (old_path.mnt != new_path.mnt) 3555 3438 goto out_dput; 3556 - error = mnt_want_write(new_path.mnt); 3557 - if (error) 3439 + error = may_linkat(&old_path); 3440 + if (unlikely(error)) 3558 3441 goto out_dput; 3559 3442 error = security_path_link(old_path.dentry, &new_path, new_dentry); 3560 3443 if (error) 3561 - goto out_drop_write; 3444 + goto out_dput; 3562 3445 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); 3563 - out_drop_write: 3564 - mnt_drop_write(new_path.mnt); 3565 3446 out_dput: 3566 - dput(new_dentry); 3567 - mutex_unlock(&new_path.dentry->d_inode->i_mutex); 3568 - path_put(&new_path); 3447 + done_path_create(&new_path, new_dentry); 3569 3448 out: 3570 3449 path_put(&old_path); 3571 3450 ··· 3757 3644 if (newnd.last_type != LAST_NORM) 3758 3645 goto exit2; 3759 3646 3647 + error = mnt_want_write(oldnd.path.mnt); 3648 + if (error) 3649 + goto exit2; 3650 + 3760 3651 oldnd.flags &= ~LOOKUP_PARENT; 3761 3652 newnd.flags &= ~LOOKUP_PARENT; 3762 3653 newnd.flags |= LOOKUP_RENAME_TARGET; ··· 3796 3679 if (new_dentry == trap) 3797 3680 goto exit5; 3798 3681 3799 - error = mnt_want_write(oldnd.path.mnt); 3800 - if (error) 3801 - goto exit5; 3802 3682 error = security_path_rename(&oldnd.path, old_dentry, 3803 3683 &newnd.path, new_dentry); 3804 3684 if (error) 3805 - goto exit6; 3685 + goto exit5; 3806 3686 error = vfs_rename(old_dir->d_inode, old_dentry, 3807 3687 new_dir->d_inode, new_dentry); 3808 - exit6: 3809 - mnt_drop_write(oldnd.path.mnt); 3810 3688 exit5: 3811 3689 dput(new_dentry); 3812 3690 exit4: 3813 3691 dput(old_dentry); 3814 3692 exit3: 3815 3693 unlock_rename(new_dir, old_dir); 3694 + mnt_drop_write(oldnd.path.mnt); 3816 3695 exit2: 3817 3696 path_put(&newnd.path); 3818 3697 putname(to);

+78 -21

fs/namespace.c

··· 283 283 } 284 284 285 285 /* 286 - * Most r/o checks on a fs are for operations that take 287 - * discrete amounts of time, like a write() or unlink(). 288 - * We must keep track of when those operations start 289 - * (for permission checks) and when they end, so that 290 - * we can determine when writes are able to occur to 291 - * a filesystem. 286 + * Most r/o & frozen checks on a fs are for operations that take discrete 287 + * amounts of time, like a write() or unlink(). We must keep track of when 288 + * those operations start (for permission checks) and when they end, so that we 289 + * can determine when writes are able to occur to a filesystem. 292 290 */ 293 291 /** 294 - * mnt_want_write - get write access to a mount 292 + * __mnt_want_write - get write access to a mount without freeze protection 295 293 * @m: the mount on which to take a write 296 294 * 297 - * This tells the low-level filesystem that a write is 298 - * about to be performed to it, and makes sure that 299 - * writes are allowed before returning success. When 300 - * the write operation is finished, mnt_drop_write() 301 - * must be called. This is effectively a refcount. 295 + * This tells the low-level filesystem that a write is about to be performed to 296 + * it, and makes sure that writes are allowed (mnt it read-write) before 297 + * returning success. This operation does not protect against filesystem being 298 + * frozen. When the write operation is finished, __mnt_drop_write() must be 299 + * called. This is effectively a refcount. 
302 300 */ 303 - int mnt_want_write(struct vfsmount *m) 301 + int __mnt_want_write(struct vfsmount *m) 304 302 { 305 303 struct mount *mnt = real_mount(m); 306 304 int ret = 0; ··· 324 326 ret = -EROFS; 325 327 } 326 328 preempt_enable(); 329 + 330 + return ret; 331 + } 332 + 333 + /** 334 + * mnt_want_write - get write access to a mount 335 + * @m: the mount on which to take a write 336 + * 337 + * This tells the low-level filesystem that a write is about to be performed to 338 + * it, and makes sure that writes are allowed (mount is read-write, filesystem 339 + * is not frozen) before returning success. When the write operation is 340 + * finished, mnt_drop_write() must be called. This is effectively a refcount. 341 + */ 342 + int mnt_want_write(struct vfsmount *m) 343 + { 344 + int ret; 345 + 346 + sb_start_write(m->mnt_sb); 347 + ret = __mnt_want_write(m); 348 + if (ret) 349 + sb_end_write(m->mnt_sb); 327 350 return ret; 328 351 } 329 352 EXPORT_SYMBOL_GPL(mnt_want_write); ··· 374 355 EXPORT_SYMBOL_GPL(mnt_clone_write); 375 356 376 357 /** 358 + * __mnt_want_write_file - get write access to a file's mount 359 + * @file: the file who's mount on which to take a write 360 + * 361 + * This is like __mnt_want_write, but it takes a file and can 362 + * do some optimisations if the file is open for write already 363 + */ 364 + int __mnt_want_write_file(struct file *file) 365 + { 366 + struct inode *inode = file->f_dentry->d_inode; 367 + 368 + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) 369 + return __mnt_want_write(file->f_path.mnt); 370 + else 371 + return mnt_clone_write(file->f_path.mnt); 372 + } 373 + 374 + /** 377 375 * mnt_want_write_file - get write access to a file's mount 378 376 * @file: the file who's mount on which to take a write 379 377 * ··· 399 363 */ 400 364 int mnt_want_write_file(struct file *file) 401 365 { 402 - struct inode *inode = file->f_dentry->d_inode; 403 - if (!(file->f_mode & FMODE_WRITE) || 
special_file(inode->i_mode)) 404 - return mnt_want_write(file->f_path.mnt); 405 - else 406 - return mnt_clone_write(file->f_path.mnt); 366 + int ret; 367 + 368 + sb_start_write(file->f_path.mnt->mnt_sb); 369 + ret = __mnt_want_write_file(file); 370 + if (ret) 371 + sb_end_write(file->f_path.mnt->mnt_sb); 372 + return ret; 407 373 } 408 374 EXPORT_SYMBOL_GPL(mnt_want_write_file); 409 375 410 376 /** 411 - * mnt_drop_write - give up write access to a mount 377 + * __mnt_drop_write - give up write access to a mount 412 378 * @mnt: the mount on which to give up write access 413 379 * 414 380 * Tells the low-level filesystem that we are done 415 381 * performing writes to it. Must be matched with 416 - * mnt_want_write() call above. 382 + * __mnt_want_write() call above. 417 383 */ 418 - void mnt_drop_write(struct vfsmount *mnt) 384 + void __mnt_drop_write(struct vfsmount *mnt) 419 385 { 420 386 preempt_disable(); 421 387 mnt_dec_writers(real_mount(mnt)); 422 388 preempt_enable(); 423 389 } 390 + 391 + /** 392 + * mnt_drop_write - give up write access to a mount 393 + * @mnt: the mount on which to give up write access 394 + * 395 + * Tells the low-level filesystem that we are done performing writes to it and 396 + * also allows filesystem to be frozen again. Must be matched with 397 + * mnt_want_write() call above. 398 + */ 399 + void mnt_drop_write(struct vfsmount *mnt) 400 + { 401 + __mnt_drop_write(mnt); 402 + sb_end_write(mnt->mnt_sb); 403 + } 424 404 EXPORT_SYMBOL_GPL(mnt_drop_write); 405 + 406 + void __mnt_drop_write_file(struct file *file) 407 + { 408 + __mnt_drop_write(file->f_path.mnt); 409 + } 425 410 426 411 void mnt_drop_write_file(struct file *file) 427 412 {

+5 -4

fs/nfsd/nfs4recover.c

··· 154 154 if (status < 0) 155 155 return; 156 156 157 + status = mnt_want_write_file(rec_file); 158 + if (status) 159 + return; 160 + 157 161 dir = rec_file->f_path.dentry; 158 162 /* lock the parent */ 159 163 mutex_lock(&dir->d_inode->i_mutex); ··· 177 173 * as well be forgiving and just succeed silently. 178 174 */ 179 175 goto out_put; 180 - status = mnt_want_write_file(rec_file); 181 - if (status) 182 - goto out_put; 183 176 status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); 184 - mnt_drop_write_file(rec_file); 185 177 out_put: 186 178 dput(dentry); 187 179 out_unlock: ··· 189 189 " (err %d); please check that %s exists" 190 190 " and is writeable", status, 191 191 user_recovery_dirname); 192 + mnt_drop_write_file(rec_file); 192 193 nfs4_reset_creds(original_cred); 193 194 } 194 195

+1

fs/nfsd/nfsfh.c

··· 635 635 fhp->fh_post_saved = 0; 636 636 #endif 637 637 } 638 + fh_drop_write(fhp); 638 639 if (exp) { 639 640 exp_put(exp); 640 641 fhp->fh_export = NULL;

+8 -1

fs/nfsd/nfsproc.c

··· 196 196 struct dentry *dchild; 197 197 int type, mode; 198 198 __be32 nfserr; 199 + int hosterr; 199 200 dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); 200 201 201 202 dprintk("nfsd: CREATE %s %.*s\n", ··· 215 214 nfserr = nfserr_exist; 216 215 if (isdotent(argp->name, argp->len)) 217 216 goto done; 217 + hosterr = fh_want_write(dirfhp); 218 + if (hosterr) { 219 + nfserr = nfserrno(hosterr); 220 + goto done; 221 + } 222 + 218 223 fh_lock_nested(dirfhp, I_MUTEX_PARENT); 219 224 dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); 220 225 if (IS_ERR(dchild)) { ··· 337 330 out_unlock: 338 331 /* We don't really need to unlock, as fh_put does it. */ 339 332 fh_unlock(dirfhp); 340 - 333 + fh_drop_write(dirfhp); 341 334 done: 342 335 fh_put(dirfhp); 343 336 return nfsd_return_dirop(nfserr, resp);

+40 -39

fs/nfsd/vfs.c

··· 1284 1284 * If it has, the parent directory should already be locked. 1285 1285 */ 1286 1286 if (!resfhp->fh_dentry) { 1287 + host_err = fh_want_write(fhp); 1288 + if (host_err) 1289 + goto out_nfserr; 1290 + 1287 1291 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ 1288 1292 fh_lock_nested(fhp, I_MUTEX_PARENT); 1289 1293 dchild = lookup_one_len(fname, dentry, flen); ··· 1331 1327 goto out; 1332 1328 } 1333 1329 1334 - host_err = fh_want_write(fhp); 1335 - if (host_err) 1336 - goto out_nfserr; 1337 - 1338 1330 /* 1339 1331 * Get the dir op function pointer. 1340 1332 */ 1341 1333 err = 0; 1334 + host_err = 0; 1342 1335 switch (type) { 1343 1336 case S_IFREG: 1344 1337 host_err = vfs_create(dirp, dchild, iap->ia_mode, true); ··· 1352 1351 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); 1353 1352 break; 1354 1353 } 1355 - if (host_err < 0) { 1356 - fh_drop_write(fhp); 1354 + if (host_err < 0) 1357 1355 goto out_nfserr; 1358 - } 1359 1356 1360 1357 err = nfsd_create_setattr(rqstp, resfhp, iap); 1361 1358 ··· 1365 1366 err2 = nfserrno(commit_metadata(fhp)); 1366 1367 if (err2) 1367 1368 err = err2; 1368 - fh_drop_write(fhp); 1369 1369 /* 1370 1370 * Update the file handle to get the new inode info. 1371 1371 */ ··· 1423 1425 err = nfserr_notdir; 1424 1426 if (!dirp->i_op->lookup) 1425 1427 goto out; 1428 + 1429 + host_err = fh_want_write(fhp); 1430 + if (host_err) 1431 + goto out_nfserr; 1432 + 1426 1433 fh_lock_nested(fhp, I_MUTEX_PARENT); 1427 1434 1428 1435 /* ··· 1460 1457 v_atime = verifier[1]&0x7fffffff; 1461 1458 } 1462 1459 1463 - host_err = fh_want_write(fhp); 1464 - if (host_err) 1465 - goto out_nfserr; 1466 1460 if (dchild->d_inode) { 1467 1461 err = 0; 1468 1462 ··· 1530 1530 if (!err) 1531 1531 err = nfserrno(commit_metadata(fhp)); 1532 1532 1533 - fh_drop_write(fhp); 1534 1533 /* 1535 1534 * Update the filehandle to get the new inode info. 
1536 1535 */ ··· 1540 1541 fh_unlock(fhp); 1541 1542 if (dchild && !IS_ERR(dchild)) 1542 1543 dput(dchild); 1544 + fh_drop_write(fhp); 1543 1545 return err; 1544 1546 1545 1547 out_nfserr: ··· 1621 1621 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); 1622 1622 if (err) 1623 1623 goto out; 1624 + 1625 + host_err = fh_want_write(fhp); 1626 + if (host_err) 1627 + goto out_nfserr; 1628 + 1624 1629 fh_lock(fhp); 1625 1630 dentry = fhp->fh_dentry; 1626 1631 dnew = lookup_one_len(fname, dentry, flen); 1627 1632 host_err = PTR_ERR(dnew); 1628 1633 if (IS_ERR(dnew)) 1629 - goto out_nfserr; 1630 - 1631 - host_err = fh_want_write(fhp); 1632 - if (host_err) 1633 1634 goto out_nfserr; 1634 1635 1635 1636 if (unlikely(path[plen] != 0)) { ··· 1692 1691 if (isdotent(name, len)) 1693 1692 goto out; 1694 1693 1694 + host_err = fh_want_write(tfhp); 1695 + if (host_err) { 1696 + err = nfserrno(host_err); 1697 + goto out; 1698 + } 1699 + 1695 1700 fh_lock_nested(ffhp, I_MUTEX_PARENT); 1696 1701 ddir = ffhp->fh_dentry; 1697 1702 dirp = ddir->d_inode; ··· 1709 1702 1710 1703 dold = tfhp->fh_dentry; 1711 1704 1712 - host_err = fh_want_write(tfhp); 1713 - if (host_err) { 1714 - err = nfserrno(host_err); 1715 - goto out_dput; 1716 - } 1717 1705 err = nfserr_noent; 1718 1706 if (!dold->d_inode) 1719 - goto out_drop_write; 1707 + goto out_dput; 1720 1708 host_err = nfsd_break_lease(dold->d_inode); 1721 1709 if (host_err) { 1722 1710 err = nfserrno(host_err); 1723 - goto out_drop_write; 1711 + goto out_dput; 1724 1712 } 1725 1713 host_err = vfs_link(dold, dirp, dnew); 1726 1714 if (!host_err) { ··· 1728 1726 else 1729 1727 err = nfserrno(host_err); 1730 1728 } 1731 - out_drop_write: 1732 - fh_drop_write(tfhp); 1733 1729 out_dput: 1734 1730 dput(dnew); 1735 1731 out_unlock: 1736 1732 fh_unlock(ffhp); 1733 + fh_drop_write(tfhp); 1737 1734 out: 1738 1735 return err; 1739 1736 ··· 1775 1774 if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) 1776 1775 goto out; 1777 1776 
1777 + host_err = fh_want_write(ffhp); 1778 + if (host_err) { 1779 + err = nfserrno(host_err); 1780 + goto out; 1781 + } 1782 + 1778 1783 /* cannot use fh_lock as we need deadlock protective ordering 1779 1784 * so do it by hand */ 1780 1785 trap = lock_rename(tdentry, fdentry); ··· 1811 1804 host_err = -EXDEV; 1812 1805 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1813 1806 goto out_dput_new; 1814 - host_err = fh_want_write(ffhp); 1815 - if (host_err) 1816 - goto out_dput_new; 1817 1807 1818 1808 host_err = nfsd_break_lease(odentry->d_inode); 1819 1809 if (host_err) 1820 - goto out_drop_write; 1810 + goto out_dput_new; 1821 1811 if (ndentry->d_inode) { 1822 1812 host_err = nfsd_break_lease(ndentry->d_inode); 1823 1813 if (host_err) 1824 - goto out_drop_write; 1814 + goto out_dput_new; 1825 1815 } 1826 1816 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1827 1817 if (!host_err) { ··· 1826 1822 if (!host_err) 1827 1823 host_err = commit_metadata(ffhp); 1828 1824 } 1829 - out_drop_write: 1830 - fh_drop_write(ffhp); 1831 1825 out_dput_new: 1832 1826 dput(ndentry); 1833 1827 out_dput_old: ··· 1841 1839 fill_post_wcc(tfhp); 1842 1840 unlock_rename(tdentry, fdentry); 1843 1841 ffhp->fh_locked = tfhp->fh_locked = 0; 1842 + fh_drop_write(ffhp); 1844 1843 1845 1844 out: 1846 1845 return err; ··· 1867 1864 if (err) 1868 1865 goto out; 1869 1866 1867 + host_err = fh_want_write(fhp); 1868 + if (host_err) 1869 + goto out_nfserr; 1870 + 1870 1871 fh_lock_nested(fhp, I_MUTEX_PARENT); 1871 1872 dentry = fhp->fh_dentry; 1872 1873 dirp = dentry->d_inode; ··· 1889 1882 if (!type) 1890 1883 type = rdentry->d_inode->i_mode & S_IFMT; 1891 1884 1892 - host_err = fh_want_write(fhp); 1893 - if (host_err) 1894 - goto out_put; 1895 - 1896 1885 host_err = nfsd_break_lease(rdentry->d_inode); 1897 1886 if (host_err) 1898 - goto out_drop_write; 1887 + goto out_put; 1899 1888 if (type != S_IFDIR) 1900 1889 host_err = vfs_unlink(dirp, rdentry); 1901 1890 else 1902 1891 
host_err = vfs_rmdir(dirp, rdentry); 1903 1892 if (!host_err) 1904 1893 host_err = commit_metadata(fhp); 1905 - out_drop_write: 1906 - fh_drop_write(fhp); 1907 1894 out_put: 1908 1895 dput(rdentry); 1909 1896

+9 -2

fs/nfsd/vfs.h

··· 110 110 111 111 static inline int fh_want_write(struct svc_fh *fh) 112 112 { 113 - return mnt_want_write(fh->fh_export->ex_path.mnt); 113 + int ret = mnt_want_write(fh->fh_export->ex_path.mnt); 114 + 115 + if (!ret) 116 + fh->fh_want_write = 1; 117 + return ret; 114 118 } 115 119 116 120 static inline void fh_drop_write(struct svc_fh *fh) 117 121 { 118 - mnt_drop_write(fh->fh_export->ex_path.mnt); 122 + if (fh->fh_want_write) { 123 + fh->fh_want_write = 0; 124 + mnt_drop_write(fh->fh_export->ex_path.mnt); 125 + } 119 126 } 120 127 121 128 #endif /* LINUX_NFSD_VFS_H */

+11 -7

fs/nilfs2/file.c

··· 69 69 struct page *page = vmf->page; 70 70 struct inode *inode = vma->vm_file->f_dentry->d_inode; 71 71 struct nilfs_transaction_info ti; 72 - int ret; 72 + int ret = 0; 73 73 74 74 if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) 75 75 return VM_FAULT_SIGBUS; /* -ENOSPC */ 76 76 77 + sb_start_pagefault(inode->i_sb); 77 78 lock_page(page); 78 79 if (page->mapping != inode->i_mapping || 79 80 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { 80 81 unlock_page(page); 81 - return VM_FAULT_NOPAGE; /* make the VM retry the fault */ 82 + ret = -EFAULT; /* make the VM retry the fault */ 83 + goto out; 82 84 } 83 85 84 86 /* ··· 114 112 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); 115 113 /* never returns -ENOMEM, but may return -ENOSPC */ 116 114 if (unlikely(ret)) 117 - return VM_FAULT_SIGBUS; 115 + goto out; 118 116 119 - ret = block_page_mkwrite(vma, vmf, nilfs_get_block); 120 - if (ret != VM_FAULT_LOCKED) { 117 + ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); 118 + if (ret) { 121 119 nilfs_transaction_abort(inode->i_sb); 122 - return ret; 120 + goto out; 123 121 } 124 122 nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); 125 123 nilfs_transaction_commit(inode->i_sb); 126 124 127 125 mapped: 128 126 wait_on_page_writeback(page); 129 - return VM_FAULT_LOCKED; 127 + out: 128 + sb_end_pagefault(inode->i_sb); 129 + return block_page_mkwrite_return(ret); 130 130 } 131 131 132 132 static const struct vm_operations_struct nilfs_file_vm_ops = {

-2

fs/nilfs2/ioctl.c

··· 660 660 goto out_free; 661 661 } 662 662 663 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 664 - 665 663 ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); 666 664 if (ret < 0) 667 665 printk(KERN_ERR "NILFS: GC failed during preparation: "

+4 -1

fs/nilfs2/segment.c

··· 189 189 if (ret > 0) 190 190 return 0; 191 191 192 - vfs_check_frozen(sb, SB_FREEZE_WRITE); 192 + sb_start_intwrite(sb); 193 193 194 194 nilfs = sb->s_fs_info; 195 195 down_read(&nilfs->ns_segctor_sem); ··· 205 205 current->journal_info = ti->ti_save; 206 206 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 207 207 kmem_cache_free(nilfs_transaction_cachep, ti); 208 + sb_end_intwrite(sb); 208 209 return ret; 209 210 } 210 211 ··· 247 246 err = nilfs_construct_segment(sb); 248 247 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 249 248 kmem_cache_free(nilfs_transaction_cachep, ti); 249 + sb_end_intwrite(sb); 250 250 return err; 251 251 } 252 252 ··· 266 264 current->journal_info = ti->ti_save; 267 265 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 268 266 kmem_cache_free(nilfs_transaction_cachep, ti); 267 + sb_end_intwrite(sb); 269 268 } 270 269 271 270 void nilfs_relax_pressure_in_lock(struct super_block *sb)

+2 -1

fs/ntfs/file.c

··· 2084 2084 if (err) 2085 2085 return err; 2086 2086 pos = *ppos; 2087 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2088 2087 /* We can write back this queue in page reclaim. */ 2089 2088 current->backing_dev_info = mapping->backing_dev_info; 2090 2089 written = 0; ··· 2118 2119 2119 2120 BUG_ON(iocb->ki_pos != pos); 2120 2121 2122 + sb_start_write(inode->i_sb); 2121 2123 mutex_lock(&inode->i_mutex); 2122 2124 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); 2123 2125 mutex_unlock(&inode->i_mutex); ··· 2127 2127 if (err < 0) 2128 2128 ret = err; 2129 2129 } 2130 + sb_end_write(inode->i_sb); 2130 2131 return ret; 2131 2132 } 2132 2133

+9 -2

fs/ocfs2/file.c

··· 1971 1971 { 1972 1972 struct inode *inode = file->f_path.dentry->d_inode; 1973 1973 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1974 + int ret; 1974 1975 1975 1976 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1976 1977 !ocfs2_writes_unwritten_extents(osb)) ··· 1986 1985 if (!(file->f_mode & FMODE_WRITE)) 1987 1986 return -EBADF; 1988 1987 1989 - return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1988 + ret = mnt_want_write_file(file); 1989 + if (ret) 1990 + return ret; 1991 + ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1992 + mnt_drop_write_file(file); 1993 + return ret; 1990 1994 } 1991 1995 1992 1996 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, ··· 2267 2261 if (iocb->ki_left == 0) 2268 2262 return 0; 2269 2263 2270 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2264 + sb_start_write(inode->i_sb); 2271 2265 2272 2266 appending = file->f_flags & O_APPEND ? 1 : 0; 2273 2267 direct_io = file->f_flags & O_DIRECT ? 1 : 0; ··· 2442 2436 ocfs2_iocb_clear_sem_locked(iocb); 2443 2437 2444 2438 mutex_unlock(&inode->i_mutex); 2439 + sb_end_write(inode->i_sb); 2445 2440 2446 2441 if (written) 2447 2442 ret = written;

+12 -2

fs/ocfs2/ioctl.c

··· 928 928 if (get_user(new_clusters, (int __user *)arg)) 929 929 return -EFAULT; 930 930 931 - return ocfs2_group_extend(inode, new_clusters); 931 + status = mnt_want_write_file(filp); 932 + if (status) 933 + return status; 934 + status = ocfs2_group_extend(inode, new_clusters); 935 + mnt_drop_write_file(filp); 936 + return status; 932 937 case OCFS2_IOC_GROUP_ADD: 933 938 case OCFS2_IOC_GROUP_ADD64: 934 939 if (!capable(CAP_SYS_RESOURCE)) ··· 942 937 if (copy_from_user(&input, (int __user *) arg, sizeof(input))) 943 938 return -EFAULT; 944 939 945 - return ocfs2_group_add(inode, &input); 940 + status = mnt_want_write_file(filp); 941 + if (status) 942 + return status; 943 + status = ocfs2_group_add(inode, &input); 944 + mnt_drop_write_file(filp); 945 + return status; 946 946 case OCFS2_IOC_REFLINK: 947 947 if (copy_from_user(&args, argp, sizeof(args))) 948 948 return -EFAULT;

+6 -1

fs/ocfs2/journal.c

··· 355 355 if (journal_current_handle()) 356 356 return jbd2_journal_start(journal, max_buffs); 357 357 358 + sb_start_intwrite(osb->sb); 359 + 358 360 down_read(&osb->journal->j_trans_barrier); 359 361 360 362 handle = jbd2_journal_start(journal, max_buffs); 361 363 if (IS_ERR(handle)) { 362 364 up_read(&osb->journal->j_trans_barrier); 365 + sb_end_intwrite(osb->sb); 363 366 364 367 mlog_errno(PTR_ERR(handle)); 365 368 ··· 391 388 if (ret < 0) 392 389 mlog_errno(ret); 393 390 394 - if (!nested) 391 + if (!nested) { 395 392 up_read(&journal->j_trans_barrier); 393 + sb_end_intwrite(osb->sb); 394 + } 396 395 397 396 return ret; 398 397 }

+2

fs/ocfs2/mmap.c

··· 136 136 sigset_t oldset; 137 137 int ret; 138 138 139 + sb_start_pagefault(inode->i_sb); 139 140 ocfs2_block_signals(&oldset); 140 141 141 142 /* ··· 166 165 167 166 out: 168 167 ocfs2_unblock_signals(&oldset); 168 + sb_end_pagefault(inode->i_sb); 169 169 return ret; 170 170 } 171 171

+1 -10

fs/ocfs2/refcounttree.c

··· 4466 4466 goto out_dput; 4467 4467 } 4468 4468 4469 - error = mnt_want_write(new_path.mnt); 4470 - if (error) { 4471 - mlog_errno(error); 4472 - goto out_dput; 4473 - } 4474 - 4475 4469 error = ocfs2_vfs_reflink(old_path.dentry, 4476 4470 new_path.dentry->d_inode, 4477 4471 new_dentry, preserve); 4478 - mnt_drop_write(new_path.mnt); 4479 4472 out_dput: 4480 - dput(new_dentry); 4481 - mutex_unlock(&new_path.dentry->d_inode->i_mutex); 4482 - path_put(&new_path); 4473 + done_path_create(&new_path, new_dentry); 4483 4474 out: 4484 4475 path_put(&old_path); 4485 4476

+9 -6

fs/open.c

··· 164 164 if (IS_APPEND(inode)) 165 165 goto out_putf; 166 166 167 + sb_start_write(inode->i_sb); 167 168 error = locks_verify_truncate(inode, file, length); 168 169 if (!error) 169 170 error = security_path_truncate(&file->f_path); 170 171 if (!error) 171 172 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); 173 + sb_end_write(inode->i_sb); 172 174 out_putf: 173 175 fput(file); 174 176 out: ··· 268 266 if (!file->f_op->fallocate) 269 267 return -EOPNOTSUPP; 270 268 271 - return file->f_op->fallocate(file, mode, offset, len); 269 + sb_start_write(inode->i_sb); 270 + ret = file->f_op->fallocate(file, mode, offset, len); 271 + sb_end_write(inode->i_sb); 272 + return ret; 272 273 } 273 274 274 275 SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) ··· 625 620 /* 626 621 * Balanced in __fput() 627 622 */ 628 - error = mnt_want_write(mnt); 623 + error = __mnt_want_write(mnt); 629 624 if (error) 630 625 put_write_access(inode); 631 626 } ··· 659 654 if (unlikely(f->f_flags & O_PATH)) 660 655 f->f_mode = FMODE_PATH; 661 656 657 + path_get(&f->f_path); 662 658 inode = f->f_path.dentry->d_inode; 663 659 if (f->f_mode & FMODE_WRITE) { 664 660 error = __get_file_write_access(inode, f->f_path.mnt); ··· 745 739 int error; 746 740 BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ 747 741 748 - mntget(file->f_path.mnt); 749 - file->f_path.dentry = dget(dentry); 750 - 742 + file->f_path.dentry = dentry; 751 743 error = do_dentry_open(file, open, current_cred()); 752 744 if (!error) 753 745 *opened |= FILE_OPENED; ··· 788 784 789 785 f->f_flags = flags; 790 786 f->f_path = *path; 791 - path_get(&f->f_path); 792 787 error = do_dentry_open(f, NULL, cred); 793 788 if (!error) { 794 789 error = open_check_o_direct(f);

+26 -49

fs/pipe.c

··· 1016 1016 return NULL; 1017 1017 } 1018 1018 1019 - struct file *create_write_pipe(int flags) 1019 + int create_pipe_files(struct file **res, int flags) 1020 1020 { 1021 1021 int err; 1022 - struct inode *inode; 1022 + struct inode *inode = get_pipe_inode(); 1023 1023 struct file *f; 1024 1024 struct path path; 1025 - struct qstr name = { .name = "" }; 1025 + static struct qstr name = { .name = "" }; 1026 1026 1027 - err = -ENFILE; 1028 - inode = get_pipe_inode(); 1029 1027 if (!inode) 1030 - goto err; 1028 + return -ENFILE; 1031 1029 1032 1030 err = -ENOMEM; 1033 1031 path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); ··· 1039 1041 f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); 1040 1042 if (!f) 1041 1043 goto err_dentry; 1042 - f->f_mapping = inode->i_mapping; 1043 1044 1044 1045 f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); 1045 - f->f_version = 0; 1046 1046 1047 - return f; 1047 + res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); 1048 + if (!res[0]) 1049 + goto err_file; 1048 1050 1049 - err_dentry: 1051 + path_get(&path); 1052 + res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); 1053 + res[1] = f; 1054 + return 0; 1055 + 1056 + err_file: 1057 + put_filp(f); 1058 + err_dentry: 1050 1059 free_pipe_info(inode); 1051 1060 path_put(&path); 1052 - return ERR_PTR(err); 1061 + return err; 1053 1062 1054 - err_inode: 1063 + err_inode: 1055 1064 free_pipe_info(inode); 1056 1065 iput(inode); 1057 - err: 1058 - return ERR_PTR(err); 1059 - } 1060 - 1061 - void free_write_pipe(struct file *f) 1062 - { 1063 - free_pipe_info(f->f_dentry->d_inode); 1064 - path_put(&f->f_path); 1065 - put_filp(f); 1066 - } 1067 - 1068 - struct file *create_read_pipe(struct file *wrf, int flags) 1069 - { 1070 - /* Grab pipe from the writer */ 1071 - struct file *f = alloc_file(&wrf->f_path, FMODE_READ, 1072 - &read_pipefifo_fops); 1073 - if (!f) 1074 - return ERR_PTR(-ENFILE); 1075 - 1076 - path_get(&wrf->f_path); 1077 - f->f_flags = O_RDONLY | (flags & 
O_NONBLOCK); 1078 - 1079 - return f; 1066 + return err; 1080 1067 } 1081 1068 1082 1069 int do_pipe_flags(int *fd, int flags) 1083 1070 { 1084 - struct file *fw, *fr; 1071 + struct file *files[2]; 1085 1072 int error; 1086 1073 int fdw, fdr; 1087 1074 1088 1075 if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) 1089 1076 return -EINVAL; 1090 1077 1091 - fw = create_write_pipe(flags); 1092 - if (IS_ERR(fw)) 1093 - return PTR_ERR(fw); 1094 - fr = create_read_pipe(fw, flags); 1095 - error = PTR_ERR(fr); 1096 - if (IS_ERR(fr)) 1097 - goto err_write_pipe; 1078 + error = create_pipe_files(files, flags); 1079 + if (error) 1080 + return error; 1098 1081 1099 1082 error = get_unused_fd_flags(flags); 1100 1083 if (error < 0) ··· 1088 1109 fdw = error; 1089 1110 1090 1111 audit_fd_pair(fdr, fdw); 1091 - fd_install(fdr, fr); 1092 - fd_install(fdw, fw); 1112 + fd_install(fdr, files[0]); 1113 + fd_install(fdw, files[1]); 1093 1114 fd[0] = fdr; 1094 1115 fd[1] = fdw; 1095 1116 ··· 1098 1119 err_fdr: 1099 1120 put_unused_fd(fdr); 1100 1121 err_read_pipe: 1101 - path_put(&fr->f_path); 1102 - put_filp(fr); 1103 - err_write_pipe: 1104 - free_write_pipe(fw); 1122 + fput(files[0]); 1123 + fput(files[1]); 1105 1124 return error; 1106 1125 } 1107 1126

+3

fs/splice.c

··· 996 996 }; 997 997 ssize_t ret; 998 998 999 + sb_start_write(inode->i_sb); 1000 + 999 1001 pipe_lock(pipe); 1000 1002 1001 1003 splice_from_pipe_begin(&sd); ··· 1036 1034 *ppos += ret; 1037 1035 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 1038 1036 } 1037 + sb_end_write(inode->i_sb); 1039 1038 1040 1039 return ret; 1041 1040 }

+230 -22

fs/super.c

··· 33 33 #include <linux/rculist_bl.h> 34 34 #include <linux/cleancache.h> 35 35 #include <linux/fsnotify.h> 36 + #include <linux/lockdep.h> 36 37 #include "internal.h" 37 38 38 39 39 40 LIST_HEAD(super_blocks); 40 41 DEFINE_SPINLOCK(sb_lock); 42 + 43 + static char *sb_writers_name[SB_FREEZE_LEVELS] = { 44 + "sb_writers", 45 + "sb_pagefaults", 46 + "sb_internal", 47 + }; 41 48 42 49 /* 43 50 * One thing we have to be careful of with a per-sb shrinker is that we don't ··· 109 102 return total_objects; 110 103 } 111 104 105 + static int init_sb_writers(struct super_block *s, struct file_system_type *type) 106 + { 107 + int err; 108 + int i; 109 + 110 + for (i = 0; i < SB_FREEZE_LEVELS; i++) { 111 + err = percpu_counter_init(&s->s_writers.counter[i], 0); 112 + if (err < 0) 113 + goto err_out; 114 + lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], 115 + &type->s_writers_key[i], 0); 116 + } 117 + init_waitqueue_head(&s->s_writers.wait); 118 + init_waitqueue_head(&s->s_writers.wait_unfrozen); 119 + return 0; 120 + err_out: 121 + while (--i >= 0) 122 + percpu_counter_destroy(&s->s_writers.counter[i]); 123 + return err; 124 + } 125 + 126 + static void destroy_sb_writers(struct super_block *s) 127 + { 128 + int i; 129 + 130 + for (i = 0; i < SB_FREEZE_LEVELS; i++) 131 + percpu_counter_destroy(&s->s_writers.counter[i]); 132 + } 133 + 112 134 /** 113 135 * alloc_super - create new superblock 114 136 * @type: filesystem type superblock should belong to ··· 153 117 154 118 if (s) { 155 119 if (security_sb_alloc(s)) { 120 + /* 121 + * We cannot call security_sb_free() without 122 + * security_sb_alloc() succeeding. 
So bail out manually 123 + */ 156 124 kfree(s); 157 125 s = NULL; 158 126 goto out; 159 127 } 160 128 #ifdef CONFIG_SMP 161 129 s->s_files = alloc_percpu(struct list_head); 162 - if (!s->s_files) { 163 - security_sb_free(s); 164 - kfree(s); 165 - s = NULL; 166 - goto out; 167 - } else { 130 + if (!s->s_files) 131 + goto err_out; 132 + else { 168 133 int i; 169 134 170 135 for_each_possible_cpu(i) ··· 174 137 #else 175 138 INIT_LIST_HEAD(&s->s_files); 176 139 #endif 140 + if (init_sb_writers(s, type)) 141 + goto err_out; 177 142 s->s_flags = flags; 178 143 s->s_bdi = &default_backing_dev_info; 179 144 INIT_HLIST_NODE(&s->s_instances); ··· 217 178 mutex_init(&s->s_dquot.dqio_mutex); 218 179 mutex_init(&s->s_dquot.dqonoff_mutex); 219 180 init_rwsem(&s->s_dquot.dqptr_sem); 220 - init_waitqueue_head(&s->s_wait_unfrozen); 221 181 s->s_maxbytes = MAX_NON_LFS; 222 182 s->s_op = &default_op; 223 183 s->s_time_gran = 1000000000; ··· 228 190 } 229 191 out: 230 192 return s; 193 + err_out: 194 + security_sb_free(s); 195 + #ifdef CONFIG_SMP 196 + if (s->s_files) 197 + free_percpu(s->s_files); 198 + #endif 199 + destroy_sb_writers(s); 200 + kfree(s); 201 + s = NULL; 202 + goto out; 231 203 } 232 204 233 205 /** ··· 251 203 #ifdef CONFIG_SMP 252 204 free_percpu(s->s_files); 253 205 #endif 206 + destroy_sb_writers(s); 254 207 security_sb_free(s); 255 208 WARN_ON(!list_empty(&s->s_mounts)); 256 209 kfree(s->s_subtype); ··· 700 651 { 701 652 while (1) { 702 653 struct super_block *s = get_super(bdev); 703 - if (!s || s->s_frozen == SB_UNFROZEN) 654 + if (!s || s->s_writers.frozen == SB_UNFROZEN) 704 655 return s; 705 656 up_read(&s->s_umount); 706 - vfs_check_frozen(s, SB_FREEZE_WRITE); 657 + wait_event(s->s_writers.wait_unfrozen, 658 + s->s_writers.frozen == SB_UNFROZEN); 707 659 put_super(s); 708 660 } 709 661 } ··· 782 732 int retval; 783 733 int remount_ro; 784 734 785 - if (sb->s_frozen != SB_UNFROZEN) 735 + if (sb->s_writers.frozen != SB_UNFROZEN) 786 736 return -EBUSY; 787 
737 788 738 #ifdef CONFIG_BLOCK ··· 1213 1163 return ERR_PTR(error); 1214 1164 } 1215 1165 1166 + /* 1167 + * This is an internal function, please use sb_end_{write,pagefault,intwrite} 1168 + * instead. 1169 + */ 1170 + void __sb_end_write(struct super_block *sb, int level) 1171 + { 1172 + percpu_counter_dec(&sb->s_writers.counter[level-1]); 1173 + /* 1174 + * Make sure s_writers are updated before we wake up waiters in 1175 + * freeze_super(). 1176 + */ 1177 + smp_mb(); 1178 + if (waitqueue_active(&sb->s_writers.wait)) 1179 + wake_up(&sb->s_writers.wait); 1180 + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); 1181 + } 1182 + EXPORT_SYMBOL(__sb_end_write); 1183 + 1184 + #ifdef CONFIG_LOCKDEP 1185 + /* 1186 + * We want lockdep to tell us about possible deadlocks with freezing but 1187 + * it's it bit tricky to properly instrument it. Getting a freeze protection 1188 + * works as getting a read lock but there are subtle problems. XFS for example 1189 + * gets freeze protection on internal level twice in some cases, which is OK 1190 + * only because we already hold a freeze protection also on higher level. Due 1191 + * to these cases we have to tell lockdep we are doing trylock when we 1192 + * already hold a freeze protection for a higher freeze level. 1193 + */ 1194 + static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, 1195 + unsigned long ip) 1196 + { 1197 + int i; 1198 + 1199 + if (!trylock) { 1200 + for (i = 0; i < level - 1; i++) 1201 + if (lock_is_held(&sb->s_writers.lock_map[i])) { 1202 + trylock = true; 1203 + break; 1204 + } 1205 + } 1206 + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); 1207 + } 1208 + #endif 1209 + 1210 + /* 1211 + * This is an internal function, please use sb_start_{write,pagefault,intwrite} 1212 + * instead. 
1213 + */ 1214 + int __sb_start_write(struct super_block *sb, int level, bool wait) 1215 + { 1216 + retry: 1217 + if (unlikely(sb->s_writers.frozen >= level)) { 1218 + if (!wait) 1219 + return 0; 1220 + wait_event(sb->s_writers.wait_unfrozen, 1221 + sb->s_writers.frozen < level); 1222 + } 1223 + 1224 + #ifdef CONFIG_LOCKDEP 1225 + acquire_freeze_lock(sb, level, !wait, _RET_IP_); 1226 + #endif 1227 + percpu_counter_inc(&sb->s_writers.counter[level-1]); 1228 + /* 1229 + * Make sure counter is updated before we check for frozen. 1230 + * freeze_super() first sets frozen and then checks the counter. 1231 + */ 1232 + smp_mb(); 1233 + if (unlikely(sb->s_writers.frozen >= level)) { 1234 + __sb_end_write(sb, level); 1235 + goto retry; 1236 + } 1237 + return 1; 1238 + } 1239 + EXPORT_SYMBOL(__sb_start_write); 1240 + 1241 + /** 1242 + * sb_wait_write - wait until all writers to given file system finish 1243 + * @sb: the super for which we wait 1244 + * @level: type of writers we wait for (normal vs page fault) 1245 + * 1246 + * This function waits until there are no writers of given type to given file 1247 + * system. Caller of this function should make sure there can be no new writers 1248 + * of type @level before calling this function. Otherwise this function can 1249 + * livelock. 
1250 + */ 1251 + static void sb_wait_write(struct super_block *sb, int level) 1252 + { 1253 + s64 writers; 1254 + 1255 + /* 1256 + * We just cycle-through lockdep here so that it does not complain 1257 + * about returning with lock to userspace 1258 + */ 1259 + rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); 1260 + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); 1261 + 1262 + do { 1263 + DEFINE_WAIT(wait); 1264 + 1265 + /* 1266 + * We use a barrier in prepare_to_wait() to separate setting 1267 + * of frozen and checking of the counter 1268 + */ 1269 + prepare_to_wait(&sb->s_writers.wait, &wait, 1270 + TASK_UNINTERRUPTIBLE); 1271 + 1272 + writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); 1273 + if (writers) 1274 + schedule(); 1275 + 1276 + finish_wait(&sb->s_writers.wait, &wait); 1277 + } while (writers); 1278 + } 1279 + 1216 1280 /** 1217 1281 * freeze_super - lock the filesystem and force it into a consistent state 1218 1282 * @sb: the super to lock ··· 1334 1170 * Syncs the super to make sure the filesystem is consistent and calls the fs's 1335 1171 * freeze_fs. Subsequent calls to this without first thawing the fs will return 1336 1172 * -EBUSY. 1173 + * 1174 + * During this function, sb->s_writers.frozen goes through these values: 1175 + * 1176 + * SB_UNFROZEN: File system is normal, all writes progress as usual. 1177 + * 1178 + * SB_FREEZE_WRITE: The file system is in the process of being frozen. New 1179 + * writes should be blocked, though page faults are still allowed. We wait for 1180 + * all writes to complete and then proceed to the next stage. 1181 + * 1182 + * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked 1183 + * but internal fs threads can still modify the filesystem (although they 1184 + * should not dirty new pages or inodes), writeback can run etc. 
After waiting 1185 + * for all running page faults we sync the filesystem which will clean all 1186 + * dirty pages and inodes (no new dirty pages or inodes can be created when 1187 + * sync is running). 1188 + * 1189 + * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs 1190 + * modification are blocked (e.g. XFS preallocation truncation on inode 1191 + * reclaim). This is usually implemented by blocking new transactions for 1192 + * filesystems that have them and need this additional guard. After all 1193 + * internal writers are finished we call ->freeze_fs() to finish filesystem 1194 + * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is 1195 + * mostly auxiliary for filesystems to verify they do not modify frozen fs. 1196 + * 1197 + * sb->s_writers.frozen is protected by sb->s_umount. 1337 1198 */ 1338 1199 int freeze_super(struct super_block *sb) 1339 1200 { ··· 1366 1177 1367 1178 atomic_inc(&sb->s_active); 1368 1179 down_write(&sb->s_umount); 1369 - if (sb->s_frozen) { 1180 + if (sb->s_writers.frozen != SB_UNFROZEN) { 1370 1181 deactivate_locked_super(sb); 1371 1182 return -EBUSY; 1372 1183 } ··· 1377 1188 } 1378 1189 1379 1190 if (sb->s_flags & MS_RDONLY) { 1380 - sb->s_frozen = SB_FREEZE_TRANS; 1381 - smp_wmb(); 1191 + /* Nothing to do really... */ 1192 + sb->s_writers.frozen = SB_FREEZE_COMPLETE; 1382 1193 up_write(&sb->s_umount); 1383 1194 return 0; 1384 1195 } 1385 1196 1386 - sb->s_frozen = SB_FREEZE_WRITE; 1197 + /* From now on, no new normal writers can start */ 1198 + sb->s_writers.frozen = SB_FREEZE_WRITE; 1387 1199 smp_wmb(); 1388 1200 1201 + /* Release s_umount to preserve sb_start_write -> s_umount ordering */ 1202 + up_write(&sb->s_umount); 1203 + 1204 + sb_wait_write(sb, SB_FREEZE_WRITE); 1205 + 1206 + /* Now we go and block page faults... 
*/ 1207 + down_write(&sb->s_umount); 1208 + sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; 1209 + smp_wmb(); 1210 + 1211 + sb_wait_write(sb, SB_FREEZE_PAGEFAULT); 1212 + 1213 + /* All writers are done so after syncing there won't be dirty data */ 1389 1214 sync_filesystem(sb); 1390 1215 1391 - sb->s_frozen = SB_FREEZE_TRANS; 1216 + /* Now wait for internal filesystem counter */ 1217 + sb->s_writers.frozen = SB_FREEZE_FS; 1392 1218 smp_wmb(); 1219 + sb_wait_write(sb, SB_FREEZE_FS); 1393 1220 1394 - sync_blockdev(sb->s_bdev); 1395 1221 if (sb->s_op->freeze_fs) { 1396 1222 ret = sb->s_op->freeze_fs(sb); 1397 1223 if (ret) { 1398 1224 printk(KERN_ERR 1399 1225 "VFS:Filesystem freeze failed\n"); 1400 - sb->s_frozen = SB_UNFROZEN; 1226 + sb->s_writers.frozen = SB_UNFROZEN; 1401 1227 smp_wmb(); 1402 - wake_up(&sb->s_wait_unfrozen); 1228 + wake_up(&sb->s_writers.wait_unfrozen); 1403 1229 deactivate_locked_super(sb); 1404 1230 return ret; 1405 1231 } 1406 1232 } 1233 + /* 1234 + * This is just for debugging purposes so that fs can warn if it 1235 + * sees write activity when frozen is set to SB_FREEZE_COMPLETE. 1236 + */ 1237 + sb->s_writers.frozen = SB_FREEZE_COMPLETE; 1407 1238 up_write(&sb->s_umount); 1408 1239 return 0; 1409 1240 } ··· 1440 1231 int error; 1441 1232 1442 1233 down_write(&sb->s_umount); 1443 - if (sb->s_frozen == SB_UNFROZEN) { 1234 + if (sb->s_writers.frozen == SB_UNFROZEN) { 1444 1235 up_write(&sb->s_umount); 1445 1236 return -EINVAL; 1446 1237 } ··· 1453 1244 if (error) { 1454 1245 printk(KERN_ERR 1455 1246 "VFS:Filesystem thaw failed\n"); 1456 - sb->s_frozen = SB_FREEZE_TRANS; 1457 1247 up_write(&sb->s_umount); 1458 1248 return error; 1459 1249 } 1460 1250 } 1461 1251 1462 1252 out: 1463 - sb->s_frozen = SB_UNFROZEN; 1253 + sb->s_writers.frozen = SB_UNFROZEN; 1464 1254 smp_wmb(); 1465 - wake_up(&sb->s_wait_unfrozen); 1255 + wake_up(&sb->s_writers.wait_unfrozen); 1466 1256 deactivate_locked_super(sb); 1467 1257 1468 1258 return 0;

+2

fs/sysfs/bin.c

··· 228 228 ret = 0; 229 229 if (bb->vm_ops->page_mkwrite) 230 230 ret = bb->vm_ops->page_mkwrite(vma, vmf); 231 + else 232 + file_update_time(file); 231 233 232 234 sysfs_put_active(attr_sd); 233 235 return ret;

+18

fs/xfs/xfs_aops.c

··· 124 124 ioend->io_append_trans = tp; 125 125 126 126 /* 127 + * We will pass freeze protection with a transaction. So tell lockdep 128 + * we released it. 129 + */ 130 + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 131 + 1, _THIS_IP_); 132 + /* 127 133 * We hand off the transaction to the completion thread now, so 128 134 * clear the flag here. 129 135 */ ··· 205 199 struct xfs_inode *ip = XFS_I(ioend->io_inode); 206 200 int error = 0; 207 201 202 + if (ioend->io_append_trans) { 203 + /* 204 + * We've got freeze protection passed with the transaction. 205 + * Tell lockdep about it. 206 + */ 207 + rwsem_acquire_read( 208 + &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 209 + 0, 1, _THIS_IP_); 210 + } 208 211 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 209 212 ioend->io_error = -EIO; 210 213 goto done; ··· 1440 1425 if (ioend->io_append_trans) { 1441 1426 current_set_flags_nested(&ioend->io_append_trans->t_pflags, 1442 1427 PF_FSTRANS); 1428 + rwsem_acquire_read( 1429 + &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 1430 + 0, 1, _THIS_IP_); 1443 1431 xfs_trans_cancel(ioend->io_append_trans, 0); 1444 1432 } 1445 1433 out_destroy_ioend:

+7 -3

fs/xfs/xfs_file.c

··· 770 770 if (ocount == 0) 771 771 return 0; 772 772 773 - xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); 773 + sb_start_write(inode->i_sb); 774 774 775 - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 776 - return -EIO; 775 + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 776 + ret = -EIO; 777 + goto out; 778 + } 777 779 778 780 if (unlikely(file->f_flags & O_DIRECT)) 779 781 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); ··· 794 792 ret = err; 795 793 } 796 794 795 + out: 796 + sb_end_write(inode->i_sb); 797 797 return ret; 798 798 } 799 799

+52 -3

fs/xfs/xfs_ioctl.c

··· 364 364 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) 365 365 return -XFS_ERROR(EFAULT); 366 366 367 + error = mnt_want_write_file(parfilp); 368 + if (error) 369 + return error; 370 + 367 371 dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); 368 - if (IS_ERR(dentry)) 372 + if (IS_ERR(dentry)) { 373 + mnt_drop_write_file(parfilp); 369 374 return PTR_ERR(dentry); 375 + } 370 376 371 377 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { 372 378 error = -XFS_ERROR(EPERM); ··· 388 382 fsd.fsd_dmstate); 389 383 390 384 out: 385 + mnt_drop_write_file(parfilp); 391 386 dput(dentry); 392 387 return error; 393 388 } ··· 641 634 if (ioflags & IO_INVIS) 642 635 attr_flags |= XFS_ATTR_DMI; 643 636 637 + error = mnt_want_write_file(filp); 638 + if (error) 639 + return error; 644 640 error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); 641 + mnt_drop_write_file(filp); 645 642 return -error; 646 643 } 647 644 ··· 1174 1163 { 1175 1164 struct fsxattr fa; 1176 1165 unsigned int mask; 1166 + int error; 1177 1167 1178 1168 if (copy_from_user(&fa, arg, sizeof(fa))) 1179 1169 return -EFAULT; ··· 1183 1171 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 1184 1172 mask |= FSX_NONBLOCK; 1185 1173 1186 - return -xfs_ioctl_setattr(ip, &fa, mask); 1174 + error = mnt_want_write_file(filp); 1175 + if (error) 1176 + return error; 1177 + error = xfs_ioctl_setattr(ip, &fa, mask); 1178 + mnt_drop_write_file(filp); 1179 + return -error; 1187 1180 } 1188 1181 1189 1182 STATIC int ··· 1213 1196 struct fsxattr fa; 1214 1197 unsigned int flags; 1215 1198 unsigned int mask; 1199 + int error; 1216 1200 1217 1201 if (copy_from_user(&flags, arg, sizeof(flags))) 1218 1202 return -EFAULT; ··· 1228 1210 mask |= FSX_NONBLOCK; 1229 1211 fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); 1230 1212 1231 - return -xfs_ioctl_setattr(ip, &fa, mask); 1213 + error = mnt_want_write_file(filp); 1214 + if (error) 1215 + return error; 1216 + error 
= xfs_ioctl_setattr(ip, &fa, mask); 1217 + mnt_drop_write_file(filp); 1218 + return -error; 1232 1219 } 1233 1220 1234 1221 STATIC int ··· 1408 1385 if (copy_from_user(&dmi, arg, sizeof(dmi))) 1409 1386 return -XFS_ERROR(EFAULT); 1410 1387 1388 + error = mnt_want_write_file(filp); 1389 + if (error) 1390 + return error; 1391 + 1411 1392 error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, 1412 1393 dmi.fsd_dmstate); 1394 + mnt_drop_write_file(filp); 1413 1395 return -error; 1414 1396 } 1415 1397 ··· 1462 1434 1463 1435 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) 1464 1436 return -XFS_ERROR(EFAULT); 1437 + error = mnt_want_write_file(filp); 1438 + if (error) 1439 + return error; 1465 1440 error = xfs_swapext(&sxp); 1441 + mnt_drop_write_file(filp); 1466 1442 return -error; 1467 1443 } 1468 1444 ··· 1495 1463 if (copy_from_user(&inout, arg, sizeof(inout))) 1496 1464 return -XFS_ERROR(EFAULT); 1497 1465 1466 + error = mnt_want_write_file(filp); 1467 + if (error) 1468 + return error; 1469 + 1498 1470 /* input parameter is passed in resblks field of structure */ 1499 1471 in = inout.resblks; 1500 1472 error = xfs_reserve_blocks(mp, &in, &inout); 1473 + mnt_drop_write_file(filp); 1501 1474 if (error) 1502 1475 return -error; 1503 1476 ··· 1533 1496 if (copy_from_user(&in, arg, sizeof(in))) 1534 1497 return -XFS_ERROR(EFAULT); 1535 1498 1499 + error = mnt_want_write_file(filp); 1500 + if (error) 1501 + return error; 1536 1502 error = xfs_growfs_data(mp, &in); 1503 + mnt_drop_write_file(filp); 1537 1504 return -error; 1538 1505 } 1539 1506 ··· 1547 1506 if (copy_from_user(&in, arg, sizeof(in))) 1548 1507 return -XFS_ERROR(EFAULT); 1549 1508 1509 + error = mnt_want_write_file(filp); 1510 + if (error) 1511 + return error; 1550 1512 error = xfs_growfs_log(mp, &in); 1513 + mnt_drop_write_file(filp); 1551 1514 return -error; 1552 1515 } 1553 1516 ··· 1561 1516 if (copy_from_user(&in, arg, sizeof(in))) 1562 1517 return -XFS_ERROR(EFAULT); 1563 1518 1519 + error = 
mnt_want_write_file(filp); 1520 + if (error) 1521 + return error; 1564 1522 error = xfs_growfs_rt(mp, &in); 1523 + mnt_drop_write_file(filp); 1565 1524 return -error; 1566 1525 } 1567 1526

+12

fs/xfs/xfs_ioctl32.c

··· 600 600 601 601 if (xfs_compat_growfs_data_copyin(&in, arg)) 602 602 return -XFS_ERROR(EFAULT); 603 + error = mnt_want_write_file(filp); 604 + if (error) 605 + return error; 603 606 error = xfs_growfs_data(mp, &in); 607 + mnt_drop_write_file(filp); 604 608 return -error; 605 609 } 606 610 case XFS_IOC_FSGROWFSRT_32: { ··· 612 608 613 609 if (xfs_compat_growfs_rt_copyin(&in, arg)) 614 610 return -XFS_ERROR(EFAULT); 611 + error = mnt_want_write_file(filp); 612 + if (error) 613 + return error; 615 614 error = xfs_growfs_rt(mp, &in); 615 + mnt_drop_write_file(filp); 616 616 return -error; 617 617 } 618 618 #endif ··· 635 627 offsetof(struct xfs_swapext, sx_stat)) || 636 628 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) 637 629 return -XFS_ERROR(EFAULT); 630 + error = mnt_want_write_file(filp); 631 + if (error) 632 + return error; 638 633 error = xfs_swapext(&sxp); 634 + mnt_drop_write_file(filp); 639 635 return -error; 640 636 } 641 637 case XFS_IOC_FSBULKSTAT_32:

+2 -2

fs/xfs/xfs_iomap.c

··· 680 680 * the same inode that we complete here and might deadlock 681 681 * on the iolock. 682 682 */ 683 - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); 683 + sb_start_intwrite(mp->m_super); 684 684 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); 685 - tp->t_flags |= XFS_TRANS_RESERVE; 685 + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; 686 686 error = xfs_trans_reserve(tp, resblks, 687 687 XFS_WRITE_LOG_RES(mp), 0, 688 688 XFS_TRANS_PERM_LOG_RES,

+1 -1

fs/xfs/xfs_mount.c

··· 1551 1551 int 1552 1552 xfs_fs_writable(xfs_mount_t *mp) 1553 1553 { 1554 - return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || 1554 + return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || 1555 1555 (mp->m_flags & XFS_MOUNT_RDONLY)); 1556 1556 } 1557 1557

-3

fs/xfs/xfs_mount.h

··· 311 311 #define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ 312 312 #define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ 313 313 314 - #define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) 315 - #define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) 316 - 317 314 /* 318 315 * Flags for xfs_mountfs 319 316 */

+1 -1

fs/xfs/xfs_sync.c

··· 403 403 if (!(mp->m_super->s_flags & MS_ACTIVE) && 404 404 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 405 405 /* dgc: errors ignored here */ 406 - if (mp->m_super->s_frozen == SB_UNFROZEN && 406 + if (mp->m_super->s_writers.frozen == SB_UNFROZEN && 407 407 xfs_log_need_covered(mp)) 408 408 error = xfs_fs_log_dummy(mp); 409 409 else

+14 -3

fs/xfs/xfs_trans.c

··· 576 576 xfs_mount_t *mp, 577 577 uint type) 578 578 { 579 - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); 580 - return _xfs_trans_alloc(mp, type, KM_SLEEP); 579 + xfs_trans_t *tp; 580 + 581 + sb_start_intwrite(mp->m_super); 582 + tp = _xfs_trans_alloc(mp, type, KM_SLEEP); 583 + tp->t_flags |= XFS_TRANS_FREEZE_PROT; 584 + return tp; 581 585 } 582 586 583 587 xfs_trans_t * ··· 592 588 { 593 589 xfs_trans_t *tp; 594 590 591 + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); 595 592 atomic_inc(&mp->m_active_trans); 596 593 597 594 tp = kmem_zone_zalloc(xfs_trans_zone, memflags); ··· 616 611 xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); 617 612 618 613 atomic_dec(&tp->t_mountp->m_active_trans); 614 + if (tp->t_flags & XFS_TRANS_FREEZE_PROT) 615 + sb_end_intwrite(tp->t_mountp->m_super); 619 616 xfs_trans_free_dqinfo(tp); 620 617 kmem_zone_free(xfs_trans_zone, tp); 621 618 } ··· 650 643 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 651 644 ASSERT(tp->t_ticket != NULL); 652 645 653 - ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); 646 + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | 647 + (tp->t_flags & XFS_TRANS_RESERVE) | 648 + (tp->t_flags & XFS_TRANS_FREEZE_PROT); 649 + /* We gave our writer reference to the new transaction */ 650 + tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; 654 651 ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); 655 652 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; 656 653 tp->t_blk_res = tp->t_blk_res_used;

+2

fs/xfs/xfs_trans.h

··· 179 179 #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ 180 180 #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ 181 181 #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ 182 + #define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer 183 + count in superblock */ 182 184 183 185 /* 184 186 * Values for call flags parameter.

+4

include/linux/audit.h

··· 130 130 #define AUDIT_LAST_KERN_ANOM_MSG 1799 131 131 #define AUDIT_ANOM_PROMISCUOUS 1700 /* Device changed promiscuous mode */ 132 132 #define AUDIT_ANOM_ABEND 1701 /* Process ended abnormally */ 133 + #define AUDIT_ANOM_LINK 1702 /* Suspicious use of file links */ 133 134 #define AUDIT_INTEGRITY_DATA 1800 /* Data integrity verification */ 134 135 #define AUDIT_INTEGRITY_METADATA 1801 /* Metadata integrity verification */ 135 136 #define AUDIT_INTEGRITY_STATUS 1802 /* Integrity enable status */ ··· 688 687 const struct path *path); 689 688 extern void audit_log_key(struct audit_buffer *ab, 690 689 char *key); 690 + extern void audit_log_link_denied(const char *operation, 691 + struct path *link); 691 692 extern void audit_log_lost(const char *message); 692 693 #ifdef CONFIG_SECURITY 693 694 extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); ··· 719 716 #define audit_log_untrustedstring(a,s) do { ; } while (0) 720 717 #define audit_log_d_path(b, p, d) do { ; } while (0) 721 718 #define audit_log_key(b, k) do { ; } while (0) 719 + #define audit_log_link_denied(o, l) do { ; } while (0) 722 720 #define audit_log_secctx(b,s) do { ; } while (0) 723 721 #define audit_enabled 0 724 722 #endif

+142 -12

include/linux/fs.h

··· 414 414 #include <linux/shrinker.h> 415 415 #include <linux/migrate_mode.h> 416 416 #include <linux/uidgid.h> 417 + #include <linux/lockdep.h> 417 418 418 419 #include <asm/byteorder.h> 419 420 ··· 441 440 extern int sysctl_nr_open; 442 441 extern struct inodes_stat_t inodes_stat; 443 442 extern int leases_enable, lease_break_time; 443 + extern int sysctl_protected_symlinks; 444 + extern int sysctl_protected_hardlinks; 444 445 445 446 struct buffer_head; 446 447 typedef int (get_block_t)(struct inode *inode, sector_t iblock, ··· 1448 1445 extern pid_t f_getown(struct file *filp); 1449 1446 extern int send_sigurg(struct fown_struct *fown); 1450 1447 1448 + struct mm_struct; 1449 + 1451 1450 /* 1452 1451 * Umount options 1453 1452 */ ··· 1462 1457 1463 1458 extern struct list_head super_blocks; 1464 1459 extern spinlock_t sb_lock; 1460 + 1461 + /* Possible states of 'frozen' field */ 1462 + enum { 1463 + SB_UNFROZEN = 0, /* FS is unfrozen */ 1464 + SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ 1465 + SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ 1466 + SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop 1467 + * internal threads if needed) */ 1468 + SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ 1469 + }; 1470 + 1471 + #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) 1472 + 1473 + struct sb_writers { 1474 + /* Counters for counting writers at each level */ 1475 + struct percpu_counter counter[SB_FREEZE_LEVELS]; 1476 + wait_queue_head_t wait; /* queue for waiting for 1477 + writers / faults to finish */ 1478 + int frozen; /* Is sb frozen? 
*/ 1479 + wait_queue_head_t wait_unfrozen; /* queue for waiting for 1480 + sb to be thawed */ 1481 + #ifdef CONFIG_DEBUG_LOCK_ALLOC 1482 + struct lockdep_map lock_map[SB_FREEZE_LEVELS]; 1483 + #endif 1484 + }; 1465 1485 1466 1486 struct super_block { 1467 1487 struct list_head s_list; /* Keep this first */ ··· 1535 1505 struct hlist_node s_instances; 1536 1506 struct quota_info s_dquot; /* Diskquota specific options */ 1537 1507 1538 - int s_frozen; 1539 - wait_queue_head_t s_wait_unfrozen; 1508 + struct sb_writers s_writers; 1540 1509 1541 1510 char s_id[32]; /* Informational name */ 1542 1511 u8 s_uuid[16]; /* UUID */ ··· 1590 1561 /* 1591 1562 * Snapshotting support. 1592 1563 */ 1593 - enum { 1594 - SB_UNFROZEN = 0, 1595 - SB_FREEZE_WRITE = 1, 1596 - SB_FREEZE_TRANS = 2, 1597 - }; 1598 1564 1599 - #define vfs_check_frozen(sb, level) \ 1600 - wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) 1565 + void __sb_end_write(struct super_block *sb, int level); 1566 + int __sb_start_write(struct super_block *sb, int level, bool wait); 1567 + 1568 + /** 1569 + * sb_end_write - drop write access to a superblock 1570 + * @sb: the super we wrote to 1571 + * 1572 + * Decrement number of writers to the filesystem. Wake up possible waiters 1573 + * wanting to freeze the filesystem. 1574 + */ 1575 + static inline void sb_end_write(struct super_block *sb) 1576 + { 1577 + __sb_end_write(sb, SB_FREEZE_WRITE); 1578 + } 1579 + 1580 + /** 1581 + * sb_end_pagefault - drop write access to a superblock from a page fault 1582 + * @sb: the super we wrote to 1583 + * 1584 + * Decrement number of processes handling write page fault to the filesystem. 1585 + * Wake up possible waiters wanting to freeze the filesystem. 
1586 + */ 1587 + static inline void sb_end_pagefault(struct super_block *sb) 1588 + { 1589 + __sb_end_write(sb, SB_FREEZE_PAGEFAULT); 1590 + } 1591 + 1592 + /** 1593 + * sb_end_intwrite - drop write access to a superblock for internal fs purposes 1594 + * @sb: the super we wrote to 1595 + * 1596 + * Decrement fs-internal number of writers to the filesystem. Wake up possible 1597 + * waiters wanting to freeze the filesystem. 1598 + */ 1599 + static inline void sb_end_intwrite(struct super_block *sb) 1600 + { 1601 + __sb_end_write(sb, SB_FREEZE_FS); 1602 + } 1603 + 1604 + /** 1605 + * sb_start_write - get write access to a superblock 1606 + * @sb: the super we write to 1607 + * 1608 + * When a process wants to write data or metadata to a file system (i.e. dirty 1609 + * a page or an inode), it should embed the operation in a sb_start_write() - 1610 + * sb_end_write() pair to get exclusion against file system freezing. This 1611 + * function increments number of writers preventing freezing. If the file 1612 + * system is already frozen, the function waits until the file system is 1613 + * thawed. 1614 + * 1615 + * Since freeze protection behaves as a lock, users have to preserve 1616 + * ordering of freeze protection and other filesystem locks. Generally, 1617 + * freeze protection should be the outermost lock. In particular, we have: 1618 + * 1619 + * sb_start_write 1620 + * -> i_mutex (write path, truncate, directory ops, ...) 
1621 + * -> s_umount (freeze_super, thaw_super) 1622 + */ 1623 + static inline void sb_start_write(struct super_block *sb) 1624 + { 1625 + __sb_start_write(sb, SB_FREEZE_WRITE, true); 1626 + } 1627 + 1628 + static inline int sb_start_write_trylock(struct super_block *sb) 1629 + { 1630 + return __sb_start_write(sb, SB_FREEZE_WRITE, false); 1631 + } 1632 + 1633 + /** 1634 + * sb_start_pagefault - get write access to a superblock from a page fault 1635 + * @sb: the super we write to 1636 + * 1637 + * When a process starts handling write page fault, it should embed the 1638 + * operation into sb_start_pagefault() - sb_end_pagefault() pair to get 1639 + * exclusion against file system freezing. This is needed since the page fault 1640 + * is going to dirty a page. This function increments number of running page 1641 + * faults preventing freezing. If the file system is already frozen, the 1642 + * function waits until the file system is thawed. 1643 + * 1644 + * Since page fault freeze protection behaves as a lock, users have to preserve 1645 + * ordering of freeze protection and other filesystem locks. It is advised to 1646 + * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault 1647 + * handling code implies lock dependency: 1648 + * 1649 + * mmap_sem 1650 + * -> sb_start_pagefault 1651 + */ 1652 + static inline void sb_start_pagefault(struct super_block *sb) 1653 + { 1654 + __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); 1655 + } 1656 + 1657 + /* 1658 + * sb_start_intwrite - get write access to a superblock for internal fs purposes 1659 + * @sb: the super we write to 1660 + * 1661 + * This is the third level of protection against filesystem freezing. It is 1662 + * free for use by a filesystem. The only requirement is that it must rank 1663 + * below sb_start_pagefault. 
1664 + * 1665 + * For example filesystem can call sb_start_intwrite() when starting a 1666 + * transaction which somewhat eases handling of freezing for internal sources 1667 + * of filesystem changes (internal fs threads, discarding preallocation on file 1668 + * close, etc.). 1669 + */ 1670 + static inline void sb_start_intwrite(struct super_block *sb) 1671 + { 1672 + __sb_start_write(sb, SB_FREEZE_FS, true); 1673 + } 1674 + 1601 1675 1602 1676 extern bool inode_owner_or_capable(const struct inode *inode); 1603 1677 ··· 2024 1892 struct lock_class_key s_lock_key; 2025 1893 struct lock_class_key s_umount_key; 2026 1894 struct lock_class_key s_vfs_rename_key; 1895 + struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; 2027 1896 2028 1897 struct lock_class_key i_lock_key; 2029 1898 struct lock_class_key i_mutex_key; ··· 2467 2334 } 2468 2335 #endif 2469 2336 extern int do_pipe_flags(int *, int); 2470 - extern struct file *create_read_pipe(struct file *f, int flags); 2471 - extern struct file *create_write_pipe(int flags); 2472 - extern void free_write_pipe(struct file *); 2473 2337 2474 2338 extern int kernel_read(struct file *, loff_t, char *, unsigned long); 2475 2339 extern struct file * open_exec(const char *);

+1

include/linux/mm.h

··· 1441 1441 1442 1442 /* generic vm_area_ops exported for stackable file systems */ 1443 1443 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); 1444 + extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1444 1445 1445 1446 /* mm/page-writeback.c */ 1446 1447 int write_one_page(struct page *page, int wait);

+1

include/linux/namei.h

··· 67 67 68 68 extern struct dentry *kern_path_create(int, const char *, struct path *, int); 69 69 extern struct dentry *user_path_create(int, const char __user *, struct path *, int); 70 + extern void done_path_create(struct path *, struct dentry *); 70 71 extern struct dentry *kern_path_locked(const char *, struct path *); 71 72 extern int vfs_path_lookup(struct dentry *, struct vfsmount *, 72 73 const char *, unsigned int, struct path *);

+1

include/linux/nfsd/nfsfh.h

··· 143 143 int fh_maxsize; /* max size for fh_handle */ 144 144 145 145 unsigned char fh_locked; /* inode locked by us */ 146 + unsigned char fh_want_write; /* remount protection taken */ 146 147 147 148 #ifdef CONFIG_NFSD_V3 148 149 unsigned char fh_post_saved; /* post-op attrs saved */

+2

include/linux/pipe_fs_i.h

··· 160 160 long pipe_fcntl(struct file *, unsigned int, unsigned long arg); 161 161 struct pipe_inode_info *get_pipe_info(struct file *file); 162 162 163 + int create_pipe_files(struct file **, int); 164 + 163 165 #endif

+21

kernel/audit.c

··· 1456 1456 } 1457 1457 1458 1458 /** 1459 + * audit_log_link_denied - report a link restriction denial 1460 + * @operation: specific link operation 1461 + * @link: the path that triggered the restriction 1462 + */ 1463 + void audit_log_link_denied(const char *operation, struct path *link) 1464 + { 1465 + struct audit_buffer *ab; 1466 + 1467 + ab = audit_log_start(current->audit_context, GFP_KERNEL, 1468 + AUDIT_ANOM_LINK); 1469 + audit_log_format(ab, "op=%s action=denied", operation); 1470 + audit_log_format(ab, " pid=%d comm=", current->pid); 1471 + audit_log_untrustedstring(ab, current->comm); 1472 + audit_log_d_path(ab, " path=", link); 1473 + audit_log_format(ab, " dev="); 1474 + audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); 1475 + audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); 1476 + audit_log_end(ab); 1477 + } 1478 + 1479 + /** 1459 1480 * audit_log_end - end one audit record 1460 1481 * @ab: the audit_buffer 1461 1482 *

+18

kernel/sysctl.c

··· 1498 1498 #endif 1499 1499 #endif 1500 1500 { 1501 + .procname = "protected_symlinks", 1502 + .data = &sysctl_protected_symlinks, 1503 + .maxlen = sizeof(int), 1504 + .mode = 0600, 1505 + .proc_handler = proc_dointvec_minmax, 1506 + .extra1 = &zero, 1507 + .extra2 = &one, 1508 + }, 1509 + { 1510 + .procname = "protected_hardlinks", 1511 + .data = &sysctl_protected_hardlinks, 1512 + .maxlen = sizeof(int), 1513 + .mode = 0600, 1514 + .proc_handler = proc_dointvec_minmax, 1515 + .extra1 = &zero, 1516 + .extra2 = &one, 1517 + }, 1518 + { 1501 1519 .procname = "suid_dumpable", 1502 1520 .data = &suid_dumpable, 1503 1521 .maxlen = sizeof(int),

+7 -7

lib/percpu_counter.c

··· 12 12 13 13 #ifdef CONFIG_HOTPLUG_CPU 14 14 static LIST_HEAD(percpu_counters); 15 - static DEFINE_MUTEX(percpu_counters_lock); 15 + static DEFINE_SPINLOCK(percpu_counters_lock); 16 16 #endif 17 17 18 18 #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER ··· 123 123 124 124 #ifdef CONFIG_HOTPLUG_CPU 125 125 INIT_LIST_HEAD(&fbc->list); 126 - mutex_lock(&percpu_counters_lock); 126 + spin_lock(&percpu_counters_lock); 127 127 list_add(&fbc->list, &percpu_counters); 128 - mutex_unlock(&percpu_counters_lock); 128 + spin_unlock(&percpu_counters_lock); 129 129 #endif 130 130 return 0; 131 131 } ··· 139 139 debug_percpu_counter_deactivate(fbc); 140 140 141 141 #ifdef CONFIG_HOTPLUG_CPU 142 - mutex_lock(&percpu_counters_lock); 142 + spin_lock(&percpu_counters_lock); 143 143 list_del(&fbc->list); 144 - mutex_unlock(&percpu_counters_lock); 144 + spin_unlock(&percpu_counters_lock); 145 145 #endif 146 146 free_percpu(fbc->counters); 147 147 fbc->counters = NULL; ··· 170 170 return NOTIFY_OK; 171 171 172 172 cpu = (unsigned long)hcpu; 173 - mutex_lock(&percpu_counters_lock); 173 + spin_lock(&percpu_counters_lock); 174 174 list_for_each_entry(fbc, &percpu_counters, list) { 175 175 s32 *pcount; 176 176 unsigned long flags; ··· 181 181 *pcount = 0; 182 182 raw_spin_unlock_irqrestore(&fbc->lock, flags); 183 183 } 184 - mutex_unlock(&percpu_counters_lock); 184 + spin_unlock(&percpu_counters_lock); 185 185 #endif 186 186 return NOTIFY_OK; 187 187 }

+29 -2

mm/filemap.c

··· 1712 1712 } 1713 1713 EXPORT_SYMBOL(filemap_fault); 1714 1714 1715 + int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1716 + { 1717 + struct page *page = vmf->page; 1718 + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1719 + int ret = VM_FAULT_LOCKED; 1720 + 1721 + sb_start_pagefault(inode->i_sb); 1722 + file_update_time(vma->vm_file); 1723 + lock_page(page); 1724 + if (page->mapping != inode->i_mapping) { 1725 + unlock_page(page); 1726 + ret = VM_FAULT_NOPAGE; 1727 + goto out; 1728 + } 1729 + /* 1730 + * We mark the page dirty already here so that when freeze is in 1731 + * progress, we are guaranteed that writeback during freezing will 1732 + * see the dirty page and writeprotect it again. 1733 + */ 1734 + set_page_dirty(page); 1735 + out: 1736 + sb_end_pagefault(inode->i_sb); 1737 + return ret; 1738 + } 1739 + EXPORT_SYMBOL(filemap_page_mkwrite); 1740 + 1715 1741 const struct vm_operations_struct generic_file_vm_ops = { 1716 1742 .fault = filemap_fault, 1743 + .page_mkwrite = filemap_page_mkwrite, 1717 1744 }; 1718 1745 1719 1746 /* This is used for a general mmap of a disk file */ ··· 2434 2407 count = ocount; 2435 2408 pos = *ppos; 2436 2409 2437 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2438 - 2439 2410 /* We can write back this queue in page reclaim */ 2440 2411 current->backing_dev_info = mapping->backing_dev_info; 2441 2412 written = 0; ··· 2532 2507 2533 2508 BUG_ON(iocb->ki_pos != pos); 2534 2509 2510 + sb_start_write(inode->i_sb); 2535 2511 mutex_lock(&inode->i_mutex); 2536 2512 blk_start_plug(&plug); 2537 2513 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); ··· 2546 2520 ret = err; 2547 2521 } 2548 2522 blk_finish_plug(&plug); 2523 + sb_end_write(inode->i_sb); 2549 2524 return ret; 2550 2525 } 2551 2526 EXPORT_SYMBOL(generic_file_aio_write);

+4 -2

mm/filemap_xip.c

··· 304 304 305 305 static const struct vm_operations_struct xip_file_vm_ops = { 306 306 .fault = xip_file_fault, 307 + .page_mkwrite = filemap_page_mkwrite, 307 308 }; 308 309 309 310 int xip_file_mmap(struct file * file, struct vm_area_struct * vma) ··· 402 401 loff_t pos; 403 402 ssize_t ret; 404 403 404 + sb_start_write(inode->i_sb); 405 + 405 406 mutex_lock(&inode->i_mutex); 406 407 407 408 if (!access_ok(VERIFY_READ, buf, len)) { ··· 413 410 414 411 pos = *ppos; 415 412 count = len; 416 - 417 - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 418 413 419 414 /* We can write back this queue in page reclaim */ 420 415 current->backing_dev_info = mapping->backing_dev_info; ··· 437 436 current->backing_dev_info = NULL; 438 437 out_up: 439 438 mutex_unlock(&inode->i_mutex); 439 + sb_end_write(inode->i_sb); 440 440 return ret; 441 441 } 442 442 EXPORT_SYMBOL_GPL(xip_file_write);

+7 -7

mm/memory.c

··· 2650 2650 if (!page_mkwrite) { 2651 2651 wait_on_page_locked(dirty_page); 2652 2652 set_page_dirty_balance(dirty_page, page_mkwrite); 2653 + /* file_update_time outside page_lock */ 2654 + if (vma->vm_file) 2655 + file_update_time(vma->vm_file); 2653 2656 } 2654 2657 put_page(dirty_page); 2655 2658 if (page_mkwrite) { ··· 2669 2666 balance_dirty_pages_ratelimited(mapping); 2670 2667 } 2671 2668 } 2672 - 2673 - /* file_update_time outside page_lock */ 2674 - if (vma->vm_file) 2675 - file_update_time(vma->vm_file); 2676 2669 2677 2670 return ret; 2678 2671 } ··· 3338 3339 3339 3340 if (dirty_page) { 3340 3341 struct address_space *mapping = page->mapping; 3342 + int dirtied = 0; 3341 3343 3342 3344 if (set_page_dirty(dirty_page)) 3343 - page_mkwrite = 1; 3345 + dirtied = 1; 3344 3346 unlock_page(dirty_page); 3345 3347 put_page(dirty_page); 3346 - if (page_mkwrite && mapping) { 3348 + if ((dirtied || page_mkwrite) && mapping) { 3347 3349 /* 3348 3350 * Some device drivers do not set page.mapping but still 3349 3351 * dirty their pages ··· 3353 3353 } 3354 3354 3355 3355 /* file_update_time outside page_lock */ 3356 - if (vma->vm_file) 3356 + if (vma->vm_file && !page_mkwrite) 3357 3357 file_update_time(vma->vm_file); 3358 3358 } else { 3359 3359 unlock_page(vmf.page);

+43 -50

net/unix/af_unix.c

··· 823 823 return NULL; 824 824 } 825 825 826 + static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) 827 + { 828 + struct dentry *dentry; 829 + struct path path; 830 + int err = 0; 831 + /* 832 + * Get the parent directory, calculate the hash for last 833 + * component. 834 + */ 835 + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); 836 + err = PTR_ERR(dentry); 837 + if (IS_ERR(dentry)) 838 + return err; 839 + 840 + /* 841 + * All right, let's create it. 842 + */ 843 + err = security_path_mknod(&path, dentry, mode, 0); 844 + if (!err) { 845 + err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); 846 + if (!err) { 847 + res->mnt = mntget(path.mnt); 848 + res->dentry = dget(dentry); 849 + } 850 + } 851 + done_path_create(&path, dentry); 852 + return err; 853 + } 826 854 827 855 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 828 856 { ··· 859 831 struct unix_sock *u = unix_sk(sk); 860 832 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 861 833 char *sun_path = sunaddr->sun_path; 862 - struct dentry *dentry = NULL; 863 - struct path path; 864 834 int err; 865 835 unsigned int hash; 866 836 struct unix_address *addr; ··· 895 869 atomic_set(&addr->refcnt, 1); 896 870 897 871 if (sun_path[0]) { 898 - umode_t mode; 899 - err = 0; 900 - /* 901 - * Get the parent directory, calculate the hash for last 902 - * component. 903 - */ 904 - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); 905 - err = PTR_ERR(dentry); 906 - if (IS_ERR(dentry)) 907 - goto out_mknod_parent; 908 - 909 - /* 910 - * All right, let's create it. 
911 - */ 912 - mode = S_IFSOCK | 872 + struct path path; 873 + umode_t mode = S_IFSOCK | 913 874 (SOCK_INODE(sock)->i_mode & ~current_umask()); 914 - err = mnt_want_write(path.mnt); 915 - if (err) 916 - goto out_mknod_dput; 917 - err = security_path_mknod(&path, dentry, mode, 0); 918 - if (err) 919 - goto out_mknod_drop_write; 920 - err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); 921 - out_mknod_drop_write: 922 - mnt_drop_write(path.mnt); 923 - if (err) 924 - goto out_mknod_dput; 925 - mutex_unlock(&path.dentry->d_inode->i_mutex); 926 - dput(path.dentry); 927 - path.dentry = dentry; 928 - 875 + err = unix_mknod(sun_path, mode, &path); 876 + if (err) { 877 + if (err == -EEXIST) 878 + err = -EADDRINUSE; 879 + unix_release_addr(addr); 880 + goto out_up; 881 + } 929 882 addr->hash = UNIX_HASH_SIZE; 930 - } 931 - 932 - spin_lock(&unix_table_lock); 933 - 934 - if (!sun_path[0]) { 883 + hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); 884 + spin_lock(&unix_table_lock); 885 + u->path = path; 886 + list = &unix_socket_table[hash]; 887 + } else { 888 + spin_lock(&unix_table_lock); 935 889 err = -EADDRINUSE; 936 890 if (__unix_find_socket_byname(net, sunaddr, addr_len, 937 891 sk->sk_type, hash)) { ··· 920 914 } 921 915 922 916 list = &unix_socket_table[addr->hash]; 923 - } else { 924 - list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; 925 - u->path = path; 926 917 } 927 918 928 919 err = 0; ··· 933 930 mutex_unlock(&u->readlock); 934 931 out: 935 932 return err; 936 - 937 - out_mknod_dput: 938 - dput(dentry); 939 - mutex_unlock(&path.dentry->d_inode->i_mutex); 940 - path_put(&path); 941 - out_mknod_parent: 942 - if (err == -EEXIST) 943 - err = -EADDRINUSE; 944 - unix_release_addr(addr); 945 - goto out_up; 946 933 } 947 934 948 935 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)

+4 -4

sound/sound_firmware.c

··· 23 23 if (l <= 0 || l > 131072) 24 24 { 25 25 printk(KERN_INFO "Invalid firmware '%s'\n", fn); 26 - filp_close(filp, current->files); 26 + filp_close(filp, NULL); 27 27 return 0; 28 28 } 29 29 dp = vmalloc(l); 30 30 if (dp == NULL) 31 31 { 32 32 printk(KERN_INFO "Out of memory loading '%s'.\n", fn); 33 - filp_close(filp, current->files); 33 + filp_close(filp, NULL); 34 34 return 0; 35 35 } 36 36 pos = 0; ··· 38 38 { 39 39 printk(KERN_INFO "Failed to read '%s'.\n", fn); 40 40 vfree(dp); 41 - filp_close(filp, current->files); 41 + filp_close(filp, NULL); 42 42 return 0; 43 43 } 44 - filp_close(filp, current->files); 44 + filp_close(filp, NULL); 45 45 *fp = dp; 46 46 return (int) l; 47 47 }