Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Reimplement RLIMIT_MEMLOCK on top of ucounts

The rlimit counter is tied to uid in the user_namespace. This allows
rlimit values to be specified in userns even if they are already
globally exceeded by the user. However, the value of the previous
user_namespaces cannot be exceeded.

Changelog

v11:
* Fix issue found by lkp robot.

v8:
* Fix issues found by lkp-tests project.

v7:
* Keep only ucounts for RLIMIT_MEMLOCK checks instead of struct cred.

v6:
* Fix bug in hugetlb_file_setup() detected by trinity.

Reported-by: kernel test robot <oliver.sang@intel.com>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/970d50c70c71bfd4496e0e8d2a0a32feebebb350.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

Authored by Alexey Gladkov and committed by Eric W. Biederman.

Commit d7c9e99a (listed alongside d6469690).

Total diffstat (15 files changed): +53 -45
+8 -8
fs/hugetlbfs/inode.c
··· 1443 1443 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. 1444 1444 */ 1445 1445 struct file *hugetlb_file_setup(const char *name, size_t size, 1446 - vm_flags_t acctflag, struct user_struct **user, 1446 + vm_flags_t acctflag, struct ucounts **ucounts, 1447 1447 int creat_flags, int page_size_log) 1448 1448 { 1449 1449 struct inode *inode; ··· 1455 1455 if (hstate_idx < 0) 1456 1456 return ERR_PTR(-ENODEV); 1457 1457 1458 - *user = NULL; 1458 + *ucounts = NULL; 1459 1459 mnt = hugetlbfs_vfsmount[hstate_idx]; 1460 1460 if (!mnt) 1461 1461 return ERR_PTR(-ENOENT); 1462 1462 1463 1463 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 1464 - *user = current_user(); 1465 - if (user_shm_lock(size, *user)) { 1464 + *ucounts = current_ucounts(); 1465 + if (user_shm_lock(size, *ucounts)) { 1466 1466 task_lock(current); 1467 1467 pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", 1468 1468 current->comm, current->pid); 1469 1469 task_unlock(current); 1470 1470 } else { 1471 - *user = NULL; 1471 + *ucounts = NULL; 1472 1472 return ERR_PTR(-EPERM); 1473 1473 } 1474 1474 } ··· 1495 1495 1496 1496 iput(inode); 1497 1497 out: 1498 - if (*user) { 1499 - user_shm_unlock(size, *user); 1500 - *user = NULL; 1498 + if (*ucounts) { 1499 + user_shm_unlock(size, *ucounts); 1500 + *ucounts = NULL; 1501 1501 } 1502 1502 return file; 1503 1503 }
+2 -2
include/linux/hugetlb.h
··· 434 434 extern const struct file_operations hugetlbfs_file_operations; 435 435 extern const struct vm_operations_struct hugetlb_vm_ops; 436 436 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, 437 - struct user_struct **user, int creat_flags, 437 + struct ucounts **ucounts, int creat_flags, 438 438 int page_size_log); 439 439 440 440 static inline bool is_file_hugepages(struct file *file) ··· 454 454 #define is_file_hugepages(file) false 455 455 static inline struct file * 456 456 hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, 457 - struct user_struct **user, int creat_flags, 457 + struct ucounts **ucounts, int creat_flags, 458 458 int page_size_log) 459 459 { 460 460 return ERR_PTR(-ENOSYS);
+2 -2
include/linux/mm.h
··· 1670 1670 #else 1671 1671 static inline bool can_do_mlock(void) { return false; } 1672 1672 #endif 1673 - extern int user_shm_lock(size_t, struct user_struct *); 1674 - extern void user_shm_unlock(size_t, struct user_struct *); 1673 + extern int user_shm_lock(size_t, struct ucounts *); 1674 + extern void user_shm_unlock(size_t, struct ucounts *); 1675 1675 1676 1676 /* 1677 1677 * Parameter block passed down to zap_pte_range in exceptional cases.
-1
include/linux/sched/user.h
··· 18 18 #ifdef CONFIG_EPOLL 19 19 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ 20 20 #endif 21 - unsigned long locked_shm; /* How many pages of mlocked shm ? */ 22 21 unsigned long unix_inflight; /* How many files in flight in unix sockets */ 23 22 atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ 24 23
+1 -1
include/linux/shmem_fs.h
··· 65 65 extern int shmem_zero_setup(struct vm_area_struct *); 66 66 extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, 67 67 unsigned long len, unsigned long pgoff, unsigned long flags); 68 - extern int shmem_lock(struct file *file, int lock, struct user_struct *user); 68 + extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); 69 69 #ifdef CONFIG_SHMEM 70 70 extern const struct address_space_operations shmem_aops; 71 71 static inline bool shmem_mapping(struct address_space *mapping)
+1
include/linux/user_namespace.h
··· 53 53 UCOUNT_RLIMIT_NPROC, 54 54 UCOUNT_RLIMIT_MSGQUEUE, 55 55 UCOUNT_RLIMIT_SIGPENDING, 56 + UCOUNT_RLIMIT_MEMLOCK, 56 57 UCOUNT_COUNTS, 57 58 }; 58 59
+13 -13
ipc/shm.c
··· 60 60 time64_t shm_ctim; 61 61 struct pid *shm_cprid; 62 62 struct pid *shm_lprid; 63 - struct user_struct *mlock_user; 63 + struct ucounts *mlock_ucounts; 64 64 65 65 /* The task created the shm object. NULL if the task is dead. */ 66 66 struct task_struct *shm_creator; ··· 286 286 shm_rmid(ns, shp); 287 287 shm_unlock(shp); 288 288 if (!is_file_hugepages(shm_file)) 289 - shmem_lock(shm_file, 0, shp->mlock_user); 290 - else if (shp->mlock_user) 289 + shmem_lock(shm_file, 0, shp->mlock_ucounts); 290 + else if (shp->mlock_ucounts) 291 291 user_shm_unlock(i_size_read(file_inode(shm_file)), 292 - shp->mlock_user); 292 + shp->mlock_ucounts); 293 293 fput(shm_file); 294 294 ipc_update_pid(&shp->shm_cprid, NULL); 295 295 ipc_update_pid(&shp->shm_lprid, NULL); ··· 625 625 626 626 shp->shm_perm.key = key; 627 627 shp->shm_perm.mode = (shmflg & S_IRWXUGO); 628 - shp->mlock_user = NULL; 628 + shp->mlock_ucounts = NULL; 629 629 630 630 shp->shm_perm.security = NULL; 631 631 error = security_shm_alloc(&shp->shm_perm); ··· 650 650 if (shmflg & SHM_NORESERVE) 651 651 acctflag = VM_NORESERVE; 652 652 file = hugetlb_file_setup(name, hugesize, acctflag, 653 - &shp->mlock_user, HUGETLB_SHMFS_INODE, 653 + &shp->mlock_ucounts, HUGETLB_SHMFS_INODE, 654 654 (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); 655 655 } else { 656 656 /* ··· 698 698 no_id: 699 699 ipc_update_pid(&shp->shm_cprid, NULL); 700 700 ipc_update_pid(&shp->shm_lprid, NULL); 701 - if (is_file_hugepages(file) && shp->mlock_user) 702 - user_shm_unlock(size, shp->mlock_user); 701 + if (is_file_hugepages(file) && shp->mlock_ucounts) 702 + user_shm_unlock(size, shp->mlock_ucounts); 703 703 fput(file); 704 704 ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); 705 705 return error; ··· 1105 1105 goto out_unlock0; 1106 1106 1107 1107 if (cmd == SHM_LOCK) { 1108 - struct user_struct *user = current_user(); 1108 + struct ucounts *ucounts = current_ucounts(); 1109 1109 1110 - err = shmem_lock(shm_file, 1, user); 1110 + err = shmem_lock(shm_file, 1, ucounts); 1111 1111 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { 1112 1112 shp->shm_perm.mode |= SHM_LOCKED; 1113 - shp->mlock_user = user; 1113 + shp->mlock_ucounts = ucounts; 1114 1114 } 1115 1115 goto out_unlock0; 1116 1116 } ··· 1118 1118 /* SHM_UNLOCK */ 1119 1119 if (!(shp->shm_perm.mode & SHM_LOCKED)) 1120 1120 goto out_unlock0; 1121 - shmem_lock(shm_file, 0, shp->mlock_user); 1121 + shmem_lock(shm_file, 0, shp->mlock_ucounts); 1122 1122 shp->shm_perm.mode &= ~SHM_LOCKED; 1123 - shp->mlock_user = NULL; 1123 + shp->mlock_ucounts = NULL; 1124 1124 get_file(shm_file); 1125 1125 ipc_unlock_object(&shp->shm_perm); 1126 1126 rcu_read_unlock();
+1
kernel/fork.c
··· 825 825 init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); 826 826 init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); 827 827 init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); 828 + init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK); 828 829 829 830 #ifdef CONFIG_VMAP_STACK 830 831 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
+1
kernel/ucount.c
··· 83 83 { }, 84 84 { }, 85 85 { }, 86 + { }, 86 87 { } 87 88 }; 88 89 #endif /* CONFIG_SYSCTL */
-1
kernel/user.c
··· 98 98 /* root_user.__count is 1, for init task cred */ 99 99 struct user_struct root_user = { 100 100 .__count = REFCOUNT_INIT(1), 101 - .locked_shm = 0, 102 101 .uid = GLOBAL_ROOT_UID, 103 102 .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), 104 103 };
+1
kernel/user_namespace.c
··· 125 125 ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); 126 126 ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); 127 127 ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); 128 + ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK); 128 129 ns->ucounts = ucounts; 129 130 130 131 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
+2 -2
mm/memfd.c
··· 297 297 } 298 298 299 299 if (flags & MFD_HUGETLB) { 300 - struct user_struct *user = NULL; 300 + struct ucounts *ucounts = NULL; 301 301 302 - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, 302 + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts, 303 303 HUGETLB_ANONHUGE_INODE, 304 304 (flags >> MFD_HUGE_SHIFT) & 305 305 MFD_HUGE_MASK);
+14 -8
mm/mlock.c
··· 817 817 */ 818 818 static DEFINE_SPINLOCK(shmlock_user_lock); 819 819 820 - int user_shm_lock(size_t size, struct user_struct *user) 820 + int user_shm_lock(size_t size, struct ucounts *ucounts) 821 821 { 822 822 unsigned long lock_limit, locked; 823 + long memlock; 823 824 int allowed = 0; 824 825 825 826 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; ··· 829 828 allowed = 1; 830 829 lock_limit >>= PAGE_SHIFT; 831 830 spin_lock(&shmlock_user_lock); 832 - if (!allowed && 833 - locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) 831 + memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); 832 + 833 + if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { 834 + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); 834 835 goto out; 835 - get_uid(user); 836 - user->locked_shm += locked; 836 + } 837 + if (!get_ucounts(ucounts)) { 838 + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); 839 + goto out; 840 + } 837 841 allowed = 1; 838 842 out: 839 843 spin_unlock(&shmlock_user_lock); 840 844 return allowed; 841 845 } 842 846 843 - void user_shm_unlock(size_t size, struct user_struct *user) 847 + void user_shm_unlock(size_t size, struct ucounts *ucounts) 844 848 { 845 849 spin_lock(&shmlock_user_lock); 846 - user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 850 + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); 847 851 spin_unlock(&shmlock_user_lock); 848 - free_uid(user); 852 + put_ucounts(ucounts); 849 853 }
+2 -2
mm/mmap.c
··· 1605 1605 goto out_fput; 1606 1606 } 1607 1607 } else if (flags & MAP_HUGETLB) { 1608 - struct user_struct *user = NULL; 1608 + struct ucounts *ucounts = NULL; 1609 1609 struct hstate *hs; 1610 1610 1611 1611 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); ··· 1621 1621 */ 1622 1622 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, 1623 1623 VM_NORESERVE, 1624 - &user, HUGETLB_ANONHUGE_INODE, 1624 + &ucounts, HUGETLB_ANONHUGE_INODE, 1625 1625 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 1626 1626 if (IS_ERR(file)) 1627 1627 return PTR_ERR(file);
+5 -5
mm/shmem.c
··· 2227 2227 } 2228 2228 #endif 2229 2229 2230 - int shmem_lock(struct file *file, int lock, struct user_struct *user) 2230 + int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 2231 2231 { 2232 2232 struct inode *inode = file_inode(file); 2233 2233 struct shmem_inode_info *info = SHMEM_I(inode); ··· 2239 2239 * no serialization needed when called from shm_destroy(). 2240 2240 */ 2241 2241 if (lock && !(info->flags & VM_LOCKED)) { 2242 - if (!user_shm_lock(inode->i_size, user)) 2242 + if (!user_shm_lock(inode->i_size, ucounts)) 2243 2243 goto out_nomem; 2244 2244 info->flags |= VM_LOCKED; 2245 2245 mapping_set_unevictable(file->f_mapping); 2246 2246 } 2247 - if (!lock && (info->flags & VM_LOCKED) && user) { 2248 - user_shm_unlock(inode->i_size, user); 2247 + if (!lock && (info->flags & VM_LOCKED) && ucounts) { 2248 + user_shm_unlock(inode->i_size, ucounts); 2249 2249 info->flags &= ~VM_LOCKED; 2250 2250 mapping_clear_unevictable(file->f_mapping); 2251 2251 } ··· 4093 4093 return 0; 4094 4094 } 4095 4095 4096 - int shmem_lock(struct file *file, int lock, struct user_struct *user) 4096 + int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 4097 4097 { 4098 4098 return 0; 4099 4099 }