Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull user namespace rlimit handling update from Eric Biederman:
"This is the work mainly by Alexey Gladkov to limit rlimits to the
rlimits of the user that created a user namespace, and to allow users
to have stricter limits on the resources created within a user
namespace."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
cred: add missing return error code when set_cred_ucounts() failed
ucounts: Silence warning in dec_rlimit_ucounts
ucounts: Set ucount_max to the largest positive value the type can hold
kselftests: Add test to check for rlimit changes in different user namespaces
Reimplement RLIMIT_MEMLOCK on top of ucounts
Reimplement RLIMIT_SIGPENDING on top of ucounts
Reimplement RLIMIT_MSGQUEUE on top of ucounts
Reimplement RLIMIT_NPROC on top of ucounts
Use atomic_t for ucounts reference counting
Add a reference to ucounts for each cred
Increase size of ucounts to atomic_long_t

+469 -128
+5 -1
fs/exec.c
··· 1360 1360 WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); 1361 1361 flush_signal_handlers(me, 0); 1362 1362 1363 + retval = set_cred_ucounts(bprm->cred); 1364 + if (retval < 0) 1365 + goto out_unlock; 1366 + 1363 1367 /* 1364 1368 * install the new credentials for this executable 1365 1369 */ ··· 1878 1874 * whether NPROC limit is still exceeded. 1879 1875 */ 1880 1876 if ((current->flags & PF_NPROC_EXCEEDED) && 1881 - atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) { 1877 + is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { 1882 1878 retval = -EAGAIN; 1883 1879 goto out_ret; 1884 1880 }
+8 -8
fs/hugetlbfs/inode.c
··· 1446 1446 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. 1447 1447 */ 1448 1448 struct file *hugetlb_file_setup(const char *name, size_t size, 1449 - vm_flags_t acctflag, struct user_struct **user, 1449 + vm_flags_t acctflag, struct ucounts **ucounts, 1450 1450 int creat_flags, int page_size_log) 1451 1451 { 1452 1452 struct inode *inode; ··· 1458 1458 if (hstate_idx < 0) 1459 1459 return ERR_PTR(-ENODEV); 1460 1460 1461 - *user = NULL; 1461 + *ucounts = NULL; 1462 1462 mnt = hugetlbfs_vfsmount[hstate_idx]; 1463 1463 if (!mnt) 1464 1464 return ERR_PTR(-ENOENT); 1465 1465 1466 1466 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 1467 - *user = current_user(); 1468 - if (user_shm_lock(size, *user)) { 1467 + *ucounts = current_ucounts(); 1468 + if (user_shm_lock(size, *ucounts)) { 1469 1469 task_lock(current); 1470 1470 pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", 1471 1471 current->comm, current->pid); 1472 1472 task_unlock(current); 1473 1473 } else { 1474 - *user = NULL; 1474 + *ucounts = NULL; 1475 1475 return ERR_PTR(-EPERM); 1476 1476 } 1477 1477 } ··· 1498 1498 1499 1499 iput(inode); 1500 1500 out: 1501 - if (*user) { 1502 - user_shm_unlock(size, *user); 1503 - *user = NULL; 1501 + if (*ucounts) { 1502 + user_shm_unlock(size, *ucounts); 1503 + *ucounts = NULL; 1504 1504 } 1505 1505 return file; 1506 1506 }
+1 -1
fs/proc/array.c
··· 284 284 collect_sigign_sigcatch(p, &ignored, &caught); 285 285 num_threads = get_nr_threads(p); 286 286 rcu_read_lock(); /* FIXME: is this correct? */ 287 - qsize = atomic_read(&__task_cred(p)->user->sigpending); 287 + qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); 288 288 rcu_read_unlock(); 289 289 qlim = task_rlimit(p, RLIMIT_SIGPENDING); 290 290 unlock_task_sighand(p, &flags);
+4
include/linux/cred.h
··· 143 143 #endif 144 144 struct user_struct *user; /* real user ID subscription */ 145 145 struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ 146 + struct ucounts *ucounts; 146 147 struct group_info *group_info; /* supplementary groups for euid/fsgid */ 147 148 /* RCU deletion */ 148 149 union { ··· 170 169 extern int set_create_files_as(struct cred *, struct inode *); 171 170 extern int cred_fscmp(const struct cred *, const struct cred *); 172 171 extern void __init cred_init(void); 172 + extern int set_cred_ucounts(struct cred *); 173 173 174 174 /* 175 175 * check for validity of credentials ··· 371 369 372 370 #define task_uid(task) (task_cred_xxx((task), uid)) 373 371 #define task_euid(task) (task_cred_xxx((task), euid)) 372 + #define task_ucounts(task) (task_cred_xxx((task), ucounts)) 374 373 375 374 #define current_cred_xxx(xxx) \ 376 375 ({ \ ··· 388 385 #define current_fsgid() (current_cred_xxx(fsgid)) 389 386 #define current_cap() (current_cred_xxx(cap_effective)) 390 387 #define current_user() (current_cred_xxx(user)) 388 + #define current_ucounts() (current_cred_xxx(ucounts)) 391 389 392 390 extern struct user_namespace init_user_ns; 393 391 #ifdef CONFIG_USER_NS
+2 -2
include/linux/hugetlb.h
··· 451 451 extern const struct file_operations hugetlbfs_file_operations; 452 452 extern const struct vm_operations_struct hugetlb_vm_ops; 453 453 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, 454 - struct user_struct **user, int creat_flags, 454 + struct ucounts **ucounts, int creat_flags, 455 455 int page_size_log); 456 456 457 457 static inline bool is_file_hugepages(struct file *file) ··· 471 471 #define is_file_hugepages(file) false 472 472 static inline struct file * 473 473 hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, 474 - struct user_struct **user, int creat_flags, 474 + struct ucounts **ucounts, int creat_flags, 475 475 int page_size_log) 476 476 { 477 477 return ERR_PTR(-ENOSYS);
+2 -2
include/linux/mm.h
··· 1709 1709 #else 1710 1710 static inline bool can_do_mlock(void) { return false; } 1711 1711 #endif 1712 - extern int user_shm_lock(size_t, struct user_struct *); 1713 - extern void user_shm_unlock(size_t, struct user_struct *); 1712 + extern int user_shm_lock(size_t, struct ucounts *); 1713 + extern void user_shm_unlock(size_t, struct ucounts *); 1714 1714 1715 1715 /* 1716 1716 * Parameter block passed down to zap_pte_range in exceptional cases.
-7
include/linux/sched/user.h
··· 12 12 */ 13 13 struct user_struct { 14 14 refcount_t __count; /* reference count */ 15 - atomic_t processes; /* How many processes does this user have? */ 16 - atomic_t sigpending; /* How many pending signals does this user have? */ 17 15 #ifdef CONFIG_EPOLL 18 16 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ 19 17 #endif 20 - #ifdef CONFIG_POSIX_MQUEUE 21 - /* protected by mq_lock */ 22 - unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ 23 - #endif 24 - unsigned long locked_shm; /* How many pages of mlocked shm ? */ 25 18 unsigned long unix_inflight; /* How many files in flight in unix sockets */ 26 19 atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ 27 20
+1 -1
include/linux/shmem_fs.h
··· 65 65 extern int shmem_zero_setup(struct vm_area_struct *); 66 66 extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, 67 67 unsigned long len, unsigned long pgoff, unsigned long flags); 68 - extern int shmem_lock(struct file *file, int lock, struct user_struct *user); 68 + extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); 69 69 #ifdef CONFIG_SHMEM 70 70 extern const struct address_space_operations shmem_aops; 71 71 static inline bool shmem_mapping(struct address_space *mapping)
+3 -1
include/linux/signal_types.h
··· 13 13 __SIGINFO; 14 14 } kernel_siginfo_t; 15 15 16 + struct ucounts; 17 + 16 18 /* 17 19 * Real Time signals may be queued. 18 20 */ ··· 23 21 struct list_head list; 24 22 int flags; 25 23 kernel_siginfo_t info; 26 - struct user_struct *user; 24 + struct ucounts *ucounts; 27 25 }; 28 26 29 27 /* flags values. */
+28 -3
include/linux/user_namespace.h
··· 54 54 UCOUNT_FANOTIFY_GROUPS, 55 55 UCOUNT_FANOTIFY_MARKS, 56 56 #endif 57 + UCOUNT_RLIMIT_NPROC, 58 + UCOUNT_RLIMIT_MSGQUEUE, 59 + UCOUNT_RLIMIT_SIGPENDING, 60 + UCOUNT_RLIMIT_MEMLOCK, 57 61 UCOUNT_COUNTS, 58 62 }; 63 + 64 + #define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC 59 65 60 66 struct user_namespace { 61 67 struct uid_gid_map uid_map; ··· 98 92 struct ctl_table_header *sysctls; 99 93 #endif 100 94 struct ucounts *ucounts; 101 - int ucount_max[UCOUNT_COUNTS]; 95 + long ucount_max[UCOUNT_COUNTS]; 102 96 } __randomize_layout; 103 97 104 98 struct ucounts { 105 99 struct hlist_node node; 106 100 struct user_namespace *ns; 107 101 kuid_t uid; 108 - int count; 109 - atomic_t ucount[UCOUNT_COUNTS]; 102 + atomic_t count; 103 + atomic_long_t ucount[UCOUNT_COUNTS]; 110 104 }; 111 105 112 106 extern struct user_namespace init_user_ns; 107 + extern struct ucounts init_ucounts; 113 108 114 109 bool setup_userns_sysctls(struct user_namespace *ns); 115 110 void retire_userns_sysctls(struct user_namespace *ns); 116 111 struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); 117 112 void dec_ucount(struct ucounts *ucounts, enum ucount_type type); 113 + struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); 114 + struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); 115 + void put_ucounts(struct ucounts *ucounts); 116 + 117 + static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type) 118 + { 119 + return atomic_long_read(&ucounts->ucount[type]); 120 + } 121 + 122 + long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); 123 + bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); 124 + bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max); 125 + 126 + static inline void set_rlimit_ucount_max(struct user_namespace *ns, 127 + enum ucount_type type, unsigned long max) 128 + { 129 + 
ns->ucount_max[type] = max <= LONG_MAX ? max : LONG_MAX; 130 + } 118 131 119 132 #ifdef CONFIG_USER_NS 120 133
+22 -20
ipc/mqueue.c
··· 144 144 struct pid *notify_owner; 145 145 u32 notify_self_exec_id; 146 146 struct user_namespace *notify_user_ns; 147 - struct user_struct *user; /* user who created, for accounting */ 147 + struct ucounts *ucounts; /* user who created, for accounting */ 148 148 struct sock *notify_sock; 149 149 struct sk_buff *notify_cookie; 150 150 ··· 292 292 struct ipc_namespace *ipc_ns, umode_t mode, 293 293 struct mq_attr *attr) 294 294 { 295 - struct user_struct *u = current_user(); 296 295 struct inode *inode; 297 296 int ret = -ENOMEM; 298 297 ··· 320 321 info->notify_owner = NULL; 321 322 info->notify_user_ns = NULL; 322 323 info->qsize = 0; 323 - info->user = NULL; /* set when all is ok */ 324 + info->ucounts = NULL; /* set when all is ok */ 324 325 info->msg_tree = RB_ROOT; 325 326 info->msg_tree_rightmost = NULL; 326 327 info->node_cache = NULL; ··· 370 371 if (mq_bytes + mq_treesize < mq_bytes) 371 372 goto out_inode; 372 373 mq_bytes += mq_treesize; 373 - spin_lock(&mq_lock); 374 - if (u->mq_bytes + mq_bytes < u->mq_bytes || 375 - u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) { 376 - spin_unlock(&mq_lock); 377 - /* mqueue_evict_inode() releases info->messages */ 378 - ret = -EMFILE; 379 - goto out_inode; 380 - } 381 - u->mq_bytes += mq_bytes; 382 - spin_unlock(&mq_lock); 374 + info->ucounts = get_ucounts(current_ucounts()); 375 + if (info->ucounts) { 376 + long msgqueue; 383 377 384 - /* all is ok */ 385 - info->user = get_uid(u); 378 + spin_lock(&mq_lock); 379 + msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); 380 + if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) { 381 + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); 382 + spin_unlock(&mq_lock); 383 + put_ucounts(info->ucounts); 384 + info->ucounts = NULL; 385 + /* mqueue_evict_inode() releases info->messages */ 386 + ret = -EMFILE; 387 + goto out_inode; 388 + } 389 + spin_unlock(&mq_lock); 390 + } 386 391 } else if (S_ISDIR(mode)) { 387 
392 inc_nlink(inode); 388 393 /* Some things misbehave if size == 0 on a directory */ ··· 500 497 static void mqueue_evict_inode(struct inode *inode) 501 498 { 502 499 struct mqueue_inode_info *info; 503 - struct user_struct *user; 504 500 struct ipc_namespace *ipc_ns; 505 501 struct msg_msg *msg, *nmsg; 506 502 LIST_HEAD(tmp_msg); ··· 522 520 free_msg(msg); 523 521 } 524 522 525 - user = info->user; 526 - if (user) { 523 + if (info->ucounts) { 527 524 unsigned long mq_bytes, mq_treesize; 528 525 529 526 /* Total amount of bytes accounted for the mqueue */ ··· 534 533 info->attr.mq_msgsize); 535 534 536 535 spin_lock(&mq_lock); 537 - user->mq_bytes -= mq_bytes; 536 + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); 538 537 /* 539 538 * get_ns_from_inode() ensures that the 540 539 * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns ··· 544 543 if (ipc_ns) 545 544 ipc_ns->mq_queues_count--; 546 545 spin_unlock(&mq_lock); 547 - free_uid(user); 546 + put_ucounts(info->ucounts); 547 + info->ucounts = NULL; 548 548 } 549 549 if (ipc_ns) 550 550 put_ipc_ns(ipc_ns);
+13 -13
ipc/shm.c
··· 60 60 time64_t shm_ctim; 61 61 struct pid *shm_cprid; 62 62 struct pid *shm_lprid; 63 - struct user_struct *mlock_user; 63 + struct ucounts *mlock_ucounts; 64 64 65 65 /* The task created the shm object. NULL if the task is dead. */ 66 66 struct task_struct *shm_creator; ··· 286 286 shm_rmid(ns, shp); 287 287 shm_unlock(shp); 288 288 if (!is_file_hugepages(shm_file)) 289 - shmem_lock(shm_file, 0, shp->mlock_user); 290 - else if (shp->mlock_user) 289 + shmem_lock(shm_file, 0, shp->mlock_ucounts); 290 + else if (shp->mlock_ucounts) 291 291 user_shm_unlock(i_size_read(file_inode(shm_file)), 292 - shp->mlock_user); 292 + shp->mlock_ucounts); 293 293 fput(shm_file); 294 294 ipc_update_pid(&shp->shm_cprid, NULL); 295 295 ipc_update_pid(&shp->shm_lprid, NULL); ··· 625 625 626 626 shp->shm_perm.key = key; 627 627 shp->shm_perm.mode = (shmflg & S_IRWXUGO); 628 - shp->mlock_user = NULL; 628 + shp->mlock_ucounts = NULL; 629 629 630 630 shp->shm_perm.security = NULL; 631 631 error = security_shm_alloc(&shp->shm_perm); ··· 650 650 if (shmflg & SHM_NORESERVE) 651 651 acctflag = VM_NORESERVE; 652 652 file = hugetlb_file_setup(name, hugesize, acctflag, 653 - &shp->mlock_user, HUGETLB_SHMFS_INODE, 653 + &shp->mlock_ucounts, HUGETLB_SHMFS_INODE, 654 654 (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); 655 655 } else { 656 656 /* ··· 698 698 no_id: 699 699 ipc_update_pid(&shp->shm_cprid, NULL); 700 700 ipc_update_pid(&shp->shm_lprid, NULL); 701 - if (is_file_hugepages(file) && shp->mlock_user) 702 - user_shm_unlock(size, shp->mlock_user); 701 + if (is_file_hugepages(file) && shp->mlock_ucounts) 702 + user_shm_unlock(size, shp->mlock_ucounts); 703 703 fput(file); 704 704 ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); 705 705 return error; ··· 1105 1105 goto out_unlock0; 1106 1106 1107 1107 if (cmd == SHM_LOCK) { 1108 - struct user_struct *user = current_user(); 1108 + struct ucounts *ucounts = current_ucounts(); 1109 1109 1110 - err = shmem_lock(shm_file, 1, user); 1110 + err = 
shmem_lock(shm_file, 1, ucounts); 1111 1111 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { 1112 1112 shp->shm_perm.mode |= SHM_LOCKED; 1113 - shp->mlock_user = user; 1113 + shp->mlock_ucounts = ucounts; 1114 1114 } 1115 1115 goto out_unlock0; 1116 1116 } ··· 1118 1118 /* SHM_UNLOCK */ 1119 1119 if (!(shp->shm_perm.mode & SHM_LOCKED)) 1120 1120 goto out_unlock0; 1121 - shmem_lock(shm_file, 0, shp->mlock_user); 1121 + shmem_lock(shm_file, 0, shp->mlock_ucounts); 1122 1122 shp->shm_perm.mode &= ~SHM_LOCKED; 1123 - shp->mlock_user = NULL; 1123 + shp->mlock_ucounts = NULL; 1124 1124 get_file(shm_file); 1125 1125 ipc_unlock_object(&shp->shm_perm); 1126 1126 rcu_read_unlock();
+46 -5
kernel/cred.c
··· 60 60 .user = INIT_USER, 61 61 .user_ns = &init_user_ns, 62 62 .group_info = &init_groups, 63 + .ucounts = &init_ucounts, 63 64 }; 64 65 65 66 static inline void set_cred_subscribers(struct cred *cred, int n) ··· 120 119 if (cred->group_info) 121 120 put_group_info(cred->group_info); 122 121 free_uid(cred->user); 122 + if (cred->ucounts) 123 + put_ucounts(cred->ucounts); 123 124 put_user_ns(cred->user_ns); 124 125 kmem_cache_free(cred_jar, cred); 125 126 } ··· 225 222 #ifdef CONFIG_DEBUG_CREDENTIALS 226 223 new->magic = CRED_MAGIC; 227 224 #endif 225 + new->ucounts = get_ucounts(&init_ucounts); 228 226 229 227 if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) 230 228 goto error; ··· 288 284 289 285 if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) 290 286 goto error; 287 + 288 + new->ucounts = get_ucounts(new->ucounts); 289 + if (!new->ucounts) 290 + goto error; 291 + 291 292 validate_creds(new); 292 293 return new; 293 294 ··· 360 351 kdebug("share_creds(%p{%d,%d})", 361 352 p->cred, atomic_read(&p->cred->usage), 362 353 read_cred_subscribers(p->cred)); 363 - atomic_inc(&p->cred->user->processes); 354 + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); 364 355 return 0; 365 356 } 366 357 ··· 370 361 371 362 if (clone_flags & CLONE_NEWUSER) { 372 363 ret = create_user_ns(new); 364 + if (ret < 0) 365 + goto error_put; 366 + ret = set_cred_ucounts(new); 373 367 if (ret < 0) 374 368 goto error_put; 375 369 } ··· 396 384 } 397 385 #endif 398 386 399 - atomic_inc(&new->user->processes); 400 387 p->cred = p->real_cred = get_cred(new); 388 + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); 401 389 alter_cred_subscribers(new, 2); 402 390 validate_creds(new); 403 391 return 0; ··· 497 485 * in set_user(). 
498 486 */ 499 487 alter_cred_subscribers(new, 2); 500 - if (new->user != old->user) 501 - atomic_inc(&new->user->processes); 488 + if (new->user != old->user || new->user_ns != old->user_ns) 489 + inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); 502 490 rcu_assign_pointer(task->real_cred, new); 503 491 rcu_assign_pointer(task->cred, new); 504 492 if (new->user != old->user) 505 - atomic_dec(&old->user->processes); 493 + dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); 506 494 alter_cred_subscribers(old, -2); 507 495 508 496 /* send notifications */ ··· 665 653 } 666 654 EXPORT_SYMBOL(cred_fscmp); 667 655 656 + int set_cred_ucounts(struct cred *new) 657 + { 658 + struct task_struct *task = current; 659 + const struct cred *old = task->real_cred; 660 + struct ucounts *old_ucounts = new->ucounts; 661 + 662 + if (new->user == old->user && new->user_ns == old->user_ns) 663 + return 0; 664 + 665 + /* 666 + * This optimization is needed because alloc_ucounts() uses locks 667 + * for table lookups. 668 + */ 669 + if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) 670 + return 0; 671 + 672 + if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid))) 673 + return -EAGAIN; 674 + 675 + if (old_ucounts) 676 + put_ucounts(old_ucounts); 677 + 678 + return 0; 679 + } 680 + 668 681 /* 669 682 * initialise the credentials stuff 670 683 */ ··· 754 717 new->security = NULL; 755 718 #endif 756 719 if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) 720 + goto error; 721 + 722 + new->ucounts = get_ucounts(new->ucounts); 723 + if (!new->ucounts) 757 724 goto error; 758 725 759 726 put_cred(old);
+1 -1
kernel/exit.c
··· 188 188 /* don't need to get the RCU readlock here - the process is dead and 189 189 * can't be modifying its own credentials. But shut RCU-lockdep up */ 190 190 rcu_read_lock(); 191 - atomic_dec(&__task_cred(p)->user->processes); 191 + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); 192 192 rcu_read_unlock(); 193 193 194 194 cgroup_release(p);
+14 -4
kernel/fork.c
··· 825 825 init_task.signal->rlim[RLIMIT_SIGPENDING] = 826 826 init_task.signal->rlim[RLIMIT_NPROC]; 827 827 828 - for (i = 0; i < UCOUNT_COUNTS; i++) 828 + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) 829 829 init_user_ns.ucount_max[i] = max_threads/2; 830 + 831 + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC)); 832 + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE)); 833 + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING)); 834 + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK)); 830 835 831 836 #ifdef CONFIG_VMAP_STACK 832 837 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", ··· 1983 1978 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); 1984 1979 #endif 1985 1980 retval = -EAGAIN; 1986 - if (atomic_read(&p->real_cred->user->processes) >= 1987 - task_rlimit(p, RLIMIT_NPROC)) { 1981 + if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { 1988 1982 if (p->real_cred->user != INIT_USER && 1989 1983 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) 1990 1984 goto bad_fork_free; ··· 2392 2388 #endif 2393 2389 delayacct_tsk_free(p); 2394 2390 bad_fork_cleanup_count: 2395 - atomic_dec(&p->cred->user->processes); 2391 + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); 2396 2392 exit_creds(p); 2397 2393 bad_fork_free: 2398 2394 WRITE_ONCE(p->__state, TASK_DEAD); ··· 3004 3000 new_cred, new_fs); 3005 3001 if (err) 3006 3002 goto bad_unshare_cleanup_cred; 3003 + 3004 + if (new_cred) { 3005 + err = set_cred_ucounts(new_cred); 3006 + if (err) 3007 + goto bad_unshare_cleanup_cred; 3008 + } 3007 3009 3008 3010 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { 3009 3011 if (do_sysvsem) {
+13 -12
kernel/signal.c
··· 412 412 int override_rlimit, const unsigned int sigqueue_flags) 413 413 { 414 414 struct sigqueue *q = NULL; 415 - struct user_struct *user; 416 - int sigpending; 415 + struct ucounts *ucounts = NULL; 416 + long sigpending; 417 417 418 418 /* 419 419 * Protect access to @t credentials. This can go away when all ··· 424 424 * changes from/to zero. 425 425 */ 426 426 rcu_read_lock(); 427 - user = __task_cred(t)->user; 428 - sigpending = atomic_inc_return(&user->sigpending); 427 + ucounts = task_ucounts(t); 428 + sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1); 429 429 if (sigpending == 1) 430 - get_uid(user); 430 + ucounts = get_ucounts(ucounts); 431 431 rcu_read_unlock(); 432 432 433 - if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { 433 + if (override_rlimit || (sigpending < LONG_MAX && sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { 434 434 q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); 435 435 } else { 436 436 print_dropped_signal(sig); 437 437 } 438 438 439 439 if (unlikely(q == NULL)) { 440 - if (atomic_dec_and_test(&user->sigpending)) 441 - free_uid(user); 440 + if (ucounts && dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) 441 + put_ucounts(ucounts); 442 442 } else { 443 443 INIT_LIST_HEAD(&q->list); 444 444 q->flags = sigqueue_flags; 445 - q->user = user; 445 + q->ucounts = ucounts; 446 446 } 447 - 448 447 return q; 449 448 } 450 449 ··· 451 452 { 452 453 if (q->flags & SIGQUEUE_PREALLOC) 453 454 return; 454 - if (atomic_dec_and_test(&q->user->sigpending)) 455 - free_uid(q->user); 455 + if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) { 456 + put_ucounts(q->ucounts); 457 + q->ucounts = NULL; 458 + } 456 459 kmem_cache_free(sigqueue_cachep, q); 457 460 } 458 461
+13 -1
kernel/sys.c
··· 479 479 * for programs doing set*uid()+execve() by harmlessly deferring the 480 480 * failure to the execve() stage. 481 481 */ 482 - if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 482 + if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && 483 483 new_user != INIT_USER) 484 484 current->flags |= PF_NPROC_EXCEEDED; 485 485 else ··· 558 558 if (retval < 0) 559 559 goto error; 560 560 561 + retval = set_cred_ucounts(new); 562 + if (retval < 0) 563 + goto error; 564 + 561 565 return commit_creds(new); 562 566 563 567 error: ··· 617 613 new->fsuid = new->euid = kuid; 618 614 619 615 retval = security_task_fix_setuid(new, old, LSM_SETID_ID); 616 + if (retval < 0) 617 + goto error; 618 + 619 + retval = set_cred_ucounts(new); 620 620 if (retval < 0) 621 621 goto error; 622 622 ··· 696 688 new->fsuid = new->euid; 697 689 698 690 retval = security_task_fix_setuid(new, old, LSM_SETID_RES); 691 + if (retval < 0) 692 + goto error; 693 + 694 + retval = set_cred_ucounts(new); 699 695 if (retval < 0) 700 696 goto error; 701 697
+91 -25
kernel/ucount.c
··· 8 8 #include <linux/kmemleak.h> 9 9 #include <linux/user_namespace.h> 10 10 11 + struct ucounts init_ucounts = { 12 + .ns = &init_user_ns, 13 + .uid = GLOBAL_ROOT_UID, 14 + .count = ATOMIC_INIT(1), 15 + }; 16 + 11 17 #define UCOUNTS_HASHTABLE_BITS 10 12 18 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; 13 19 static DEFINE_SPINLOCK(ucounts_lock); ··· 84 78 UCOUNT_ENTRY("max_fanotify_groups"), 85 79 UCOUNT_ENTRY("max_fanotify_marks"), 86 80 #endif 81 + { }, 82 + { }, 83 + { }, 84 + { }, 87 85 { } 88 86 }; 89 87 #endif /* CONFIG_SYSCTL */ ··· 139 129 return NULL; 140 130 } 141 131 142 - static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) 132 + static void hlist_add_ucounts(struct ucounts *ucounts) 133 + { 134 + struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); 135 + spin_lock_irq(&ucounts_lock); 136 + hlist_add_head(&ucounts->node, hashent); 137 + spin_unlock_irq(&ucounts_lock); 138 + } 139 + 140 + struct ucounts *get_ucounts(struct ucounts *ucounts) 141 + { 142 + if (ucounts && atomic_add_negative(1, &ucounts->count)) { 143 + put_ucounts(ucounts); 144 + ucounts = NULL; 145 + } 146 + return ucounts; 147 + } 148 + 149 + struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) 143 150 { 144 151 struct hlist_head *hashent = ucounts_hashentry(ns, uid); 145 152 struct ucounts *ucounts, *new; ··· 172 145 173 146 new->ns = ns; 174 147 new->uid = uid; 175 - new->count = 0; 148 + atomic_set(&new->count, 1); 176 149 177 150 spin_lock_irq(&ucounts_lock); 178 151 ucounts = find_ucounts(ns, uid, hashent); ··· 180 153 kfree(new); 181 154 } else { 182 155 hlist_add_head(&new->node, hashent); 183 - ucounts = new; 156 + spin_unlock_irq(&ucounts_lock); 157 + return new; 184 158 } 185 159 } 186 - if (ucounts->count == INT_MAX) 187 - ucounts = NULL; 188 - else 189 - ucounts->count += 1; 190 160 spin_unlock_irq(&ucounts_lock); 161 + ucounts = get_ucounts(ucounts); 191 162 return ucounts; 192 163 } 
193 164 194 - static void put_ucounts(struct ucounts *ucounts) 165 + void put_ucounts(struct ucounts *ucounts) 195 166 { 196 167 unsigned long flags; 197 168 198 - spin_lock_irqsave(&ucounts_lock, flags); 199 - ucounts->count -= 1; 200 - if (!ucounts->count) 169 + if (atomic_dec_and_test(&ucounts->count)) { 170 + spin_lock_irqsave(&ucounts_lock, flags); 201 171 hlist_del_init(&ucounts->node); 202 - else 203 - ucounts = NULL; 204 - spin_unlock_irqrestore(&ucounts_lock, flags); 205 - 206 - kfree(ucounts); 172 + spin_unlock_irqrestore(&ucounts_lock, flags); 173 + kfree(ucounts); 174 + } 207 175 } 208 176 209 - static inline bool atomic_inc_below(atomic_t *v, int u) 177 + static inline bool atomic_long_inc_below(atomic_long_t *v, int u) 210 178 { 211 - int c, old; 212 - c = atomic_read(v); 179 + long c, old; 180 + c = atomic_long_read(v); 213 181 for (;;) { 214 182 if (unlikely(c >= u)) 215 183 return false; 216 - old = atomic_cmpxchg(v, c, c+1); 184 + old = atomic_long_cmpxchg(v, c, c+1); 217 185 if (likely(old == c)) 218 186 return true; 219 187 c = old; ··· 220 198 { 221 199 struct ucounts *ucounts, *iter, *bad; 222 200 struct user_namespace *tns; 223 - ucounts = get_ucounts(ns, uid); 201 + ucounts = alloc_ucounts(ns, uid); 224 202 for (iter = ucounts; iter; iter = tns->ucounts) { 225 - int max; 203 + long max; 226 204 tns = iter->ns; 227 205 max = READ_ONCE(tns->ucount_max[type]); 228 - if (!atomic_inc_below(&iter->ucount[type], max)) 206 + if (!atomic_long_inc_below(&iter->ucount[type], max)) 229 207 goto fail; 230 208 } 231 209 return ucounts; 232 210 fail: 233 211 bad = iter; 234 212 for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) 235 - atomic_dec(&iter->ucount[type]); 213 + atomic_long_dec(&iter->ucount[type]); 236 214 237 215 put_ucounts(ucounts); 238 216 return NULL; ··· 242 220 { 243 221 struct ucounts *iter; 244 222 for (iter = ucounts; iter; iter = iter->ns->ucounts) { 245 - int dec = atomic_dec_if_positive(&iter->ucount[type]); 223 + long dec 
= atomic_long_dec_if_positive(&iter->ucount[type]); 246 224 WARN_ON_ONCE(dec < 0); 247 225 } 248 226 put_ucounts(ucounts); 227 + } 228 + 229 + long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) 230 + { 231 + struct ucounts *iter; 232 + long ret = 0; 233 + 234 + for (iter = ucounts; iter; iter = iter->ns->ucounts) { 235 + long max = READ_ONCE(iter->ns->ucount_max[type]); 236 + long new = atomic_long_add_return(v, &iter->ucount[type]); 237 + if (new < 0 || new > max) 238 + ret = LONG_MAX; 239 + else if (iter == ucounts) 240 + ret = new; 241 + } 242 + return ret; 243 + } 244 + 245 + bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) 246 + { 247 + struct ucounts *iter; 248 + long new = -1; /* Silence compiler warning */ 249 + for (iter = ucounts; iter; iter = iter->ns->ucounts) { 250 + long dec = atomic_long_add_return(-v, &iter->ucount[type]); 251 + WARN_ON_ONCE(dec < 0); 252 + if (iter == ucounts) 253 + new = dec; 254 + } 255 + return (new == 0); 256 + } 257 + 258 + bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) 259 + { 260 + struct ucounts *iter; 261 + if (get_ucounts_value(ucounts, type) > max) 262 + return true; 263 + for (iter = ucounts; iter; iter = iter->ns->ucounts) { 264 + max = READ_ONCE(iter->ns->ucount_max[type]); 265 + if (get_ucounts_value(iter, type) > max) 266 + return true; 267 + } 268 + return false; 249 269 } 250 270 251 271 static __init int user_namespace_sysctl_init(void) ··· 305 241 BUG_ON(!user_header); 306 242 BUG_ON(!setup_userns_sysctls(&init_user_ns)); 307 243 #endif 244 + hlist_add_ucounts(&init_ucounts); 245 + inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); 308 246 return 0; 309 247 } 310 248 subsys_initcall(user_namespace_sysctl_init);
-3
kernel/user.c
··· 98 98 /* root_user.__count is 1, for init task cred */ 99 99 struct user_struct root_user = { 100 100 .__count = REFCOUNT_INIT(1), 101 - .processes = ATOMIC_INIT(1), 102 - .sigpending = ATOMIC_INIT(0), 103 - .locked_shm = 0, 104 101 .uid = GLOBAL_ROOT_UID, 105 102 .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), 106 103 };
+8 -1
kernel/user_namespace.c
··· 119 119 ns->owner = owner; 120 120 ns->group = group; 121 121 INIT_WORK(&ns->work, free_user_ns); 122 - for (i = 0; i < UCOUNT_COUNTS; i++) { 122 + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { 123 123 ns->ucount_max[i] = INT_MAX; 124 124 } 125 + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); 126 + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); 127 + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); 128 + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); 125 129 ns->ucounts = ucounts; 126 130 127 131 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ ··· 1343 1339 1344 1340 put_user_ns(cred->user_ns); 1345 1341 set_cred_user_ns(cred, get_user_ns(user_ns)); 1342 + 1343 + if (set_cred_ucounts(cred) < 0) 1344 + return -EINVAL; 1346 1345 1347 1346 return 0; 1348 1347 }
+2 -2
mm/memfd.c
··· 297 297 } 298 298 299 299 if (flags & MFD_HUGETLB) { 300 - struct user_struct *user = NULL; 300 + struct ucounts *ucounts = NULL; 301 301 302 - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, 302 + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts, 303 303 HUGETLB_ANONHUGE_INODE, 304 304 (flags >> MFD_HUGE_SHIFT) & 305 305 MFD_HUGE_MASK);
+14 -8
mm/mlock.c
··· 817 817 */ 818 818 static DEFINE_SPINLOCK(shmlock_user_lock); 819 819 820 - int user_shm_lock(size_t size, struct user_struct *user) 820 + int user_shm_lock(size_t size, struct ucounts *ucounts) 821 821 { 822 822 unsigned long lock_limit, locked; 823 + long memlock; 823 824 int allowed = 0; 824 825 825 826 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; ··· 829 828 allowed = 1; 830 829 lock_limit >>= PAGE_SHIFT; 831 830 spin_lock(&shmlock_user_lock); 832 - if (!allowed && 833 - locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) 831 + memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); 832 + 833 + if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { 834 + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); 834 835 goto out; 835 - get_uid(user); 836 - user->locked_shm += locked; 836 + } 837 + if (!get_ucounts(ucounts)) { 838 + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); 839 + goto out; 840 + } 837 841 allowed = 1; 838 842 out: 839 843 spin_unlock(&shmlock_user_lock); 840 844 return allowed; 841 845 } 842 846 843 - void user_shm_unlock(size_t size, struct user_struct *user) 847 + void user_shm_unlock(size_t size, struct ucounts *ucounts) 844 848 { 845 849 spin_lock(&shmlock_user_lock); 846 - user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 850 + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); 847 851 spin_unlock(&shmlock_user_lock); 848 - free_uid(user); 852 + put_ucounts(ucounts); 849 853 }
+2 -2
mm/mmap.c
··· 1611 1611 goto out_fput; 1612 1612 } 1613 1613 } else if (flags & MAP_HUGETLB) { 1614 - struct user_struct *user = NULL; 1614 + struct ucounts *ucounts = NULL; 1615 1615 struct hstate *hs; 1616 1616 1617 1617 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); ··· 1627 1627 */ 1628 1628 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, 1629 1629 VM_NORESERVE, 1630 - &user, HUGETLB_ANONHUGE_INODE, 1630 + &ucounts, HUGETLB_ANONHUGE_INODE, 1631 1631 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 1632 1632 if (IS_ERR(file)) 1633 1633 return PTR_ERR(file);
+5 -5
mm/shmem.c
··· 2227 2227 } 2228 2228 #endif 2229 2229 2230 - int shmem_lock(struct file *file, int lock, struct user_struct *user) 2230 + int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 2231 2231 { 2232 2232 struct inode *inode = file_inode(file); 2233 2233 struct shmem_inode_info *info = SHMEM_I(inode); ··· 2239 2239 * no serialization needed when called from shm_destroy(). 2240 2240 */ 2241 2241 if (lock && !(info->flags & VM_LOCKED)) { 2242 - if (!user_shm_lock(inode->i_size, user)) 2242 + if (!user_shm_lock(inode->i_size, ucounts)) 2243 2243 goto out_nomem; 2244 2244 info->flags |= VM_LOCKED; 2245 2245 mapping_set_unevictable(file->f_mapping); 2246 2246 } 2247 - if (!lock && (info->flags & VM_LOCKED) && user) { 2248 - user_shm_unlock(inode->i_size, user); 2247 + if (!lock && (info->flags & VM_LOCKED) && ucounts) { 2248 + user_shm_unlock(inode->i_size, ucounts); 2249 2249 info->flags &= ~VM_LOCKED; 2250 2250 mapping_clear_unevictable(file->f_mapping); 2251 2251 } ··· 4092 4092 return 0; 4093 4093 } 4094 4094 4095 - int shmem_lock(struct file *file, int lock, struct user_struct *user) 4095 + int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 4096 4096 { 4097 4097 return 0; 4098 4098 }
+1
tools/testing/selftests/Makefile
··· 49 49 TARGETS += pstore 50 50 TARGETS += ptrace 51 51 TARGETS += openat2 52 + TARGETS += rlimits 52 53 TARGETS += rseq 53 54 TARGETS += rtc 54 55 TARGETS += seccomp
+2
tools/testing/selftests/rlimits/.gitignore
··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + rlimits-per-userns
+6
tools/testing/selftests/rlimits/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + CFLAGS += -Wall -O2 -g 4 + TEST_GEN_PROGS := rlimits-per-userns 5 + 6 + include ../lib.mk
+1
tools/testing/selftests/rlimits/config
··· 1 + CONFIG_USER_NS=y
+161
tools/testing/selftests/rlimits/rlimits-per-userns.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later
2 + /*
3 + * Author: Alexey Gladkov <gladkov.alexey@gmail.com>
4 + */
5 + #define _GNU_SOURCE
6 + #include <sys/types.h>
7 + #include <sys/wait.h>
8 + #include <sys/time.h>
9 + #include <sys/resource.h>
10 + #include <sys/prctl.h>
11 + #include <sys/stat.h>
12 + 
13 + #include <unistd.h>
14 + #include <stdlib.h>
15 + #include <stdio.h>
16 + #include <string.h>
17 + #include <sched.h>
18 + #include <signal.h>
19 + #include <limits.h>
20 + #include <fcntl.h>
21 + #include <errno.h>
22 + #include <err.h>
23 + 
24 + #define NR_CHILDS 2
25 + 
26 + static char *service_prog;
27 + static uid_t user = 60000;
28 + static uid_t group = 60000;
29 + 
30 + static void setrlimit_nproc(rlim_t n)
31 + {
32 + 	pid_t pid = getpid();
33 + 	struct rlimit limit = {
34 + 		.rlim_cur = n,
35 + 		.rlim_max = n
36 + 	};
37 + 
38 + 	warnx("(pid=%d): Setting RLIMIT_NPROC=%ld", pid, n);
39 + 
40 + 	if (setrlimit(RLIMIT_NPROC, &limit) < 0)
41 + 		err(EXIT_FAILURE, "(pid=%d): setrlimit(RLIMIT_NPROC)", pid);
42 + }
43 + 
44 + static pid_t fork_child(void)
45 + {
46 + 	pid_t pid = fork();
47 + 
48 + 	if (pid < 0)
49 + 		err(EXIT_FAILURE, "fork");
50 + 
51 + 	if (pid > 0)
52 + 		return pid;
53 + 
54 + 	pid = getpid();
55 + 
56 + 	warnx("(pid=%d): New process starting ...", pid);
57 + 
58 + 	if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
59 + 		err(EXIT_FAILURE, "(pid=%d): prctl(PR_SET_PDEATHSIG)", pid);
60 + 
61 + 	signal(SIGUSR1, SIG_DFL);
62 + 
63 + 	warnx("(pid=%d): Changing to uid=%d, gid=%d", pid, user, group);
64 + 
65 + 	if (setgid(group) < 0)
66 + 		err(EXIT_FAILURE, "(pid=%d): setgid(%d)", pid, group);
67 + 	if (setuid(user) < 0)
68 + 		err(EXIT_FAILURE, "(pid=%d): setuid(%d)", pid, user);
69 + 
70 + 	warnx("(pid=%d): Service running ...", pid);
71 + 
72 + 	warnx("(pid=%d): Unshare user namespace", pid);
73 + 	if (unshare(CLONE_NEWUSER) < 0)
74 + 		err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)");
75 + 
76 + 	char *const argv[] = { "service", NULL };
77 + 	char *const envp[] = { "I_AM_SERVICE=1", NULL };
78 + 
79 + 	warnx("(pid=%d): Executing real service ...", pid);
80 + 
81 + 	execve(service_prog, argv, envp);
82 + 	err(EXIT_FAILURE, "(pid=%d): execve", pid);
83 + }
84 + 
85 + int main(int argc, char **argv)
86 + {
87 + 	size_t i;
88 + 	pid_t child[NR_CHILDS];
89 + 	int wstatus[NR_CHILDS];
90 + 	int childs = NR_CHILDS;
91 + 	pid_t pid;
92 + 
93 + 	if (getenv("I_AM_SERVICE")) {
94 + 		pause();
95 + 		exit(EXIT_SUCCESS);
96 + 	}
97 + 
98 + 	service_prog = argv[0];
99 + 	pid = getpid();
100 + 
101 + 	warnx("(pid=%d) Starting testcase", pid);
102 + 
103 + 	/*
104 + 	 * This rlimit is not a problem for root because it can be exceeded.
105 + 	 */
106 + 	setrlimit_nproc(1);
107 + 
108 + 	for (i = 0; i < NR_CHILDS; i++) {
109 + 		child[i] = fork_child();
110 + 		wstatus[i] = 0;
111 + 		usleep(250000);
112 + 	}
113 + 
114 + 	while (1) {
115 + 		for (i = 0; i < NR_CHILDS; i++) {
116 + 			if (child[i] <= 0)
117 + 				continue;
118 + 
119 + 			errno = 0;
120 + 			pid_t ret = waitpid(child[i], &wstatus[i], WNOHANG);
121 + 
122 + 			if (!ret || (!WIFEXITED(wstatus[i]) && !WIFSIGNALED(wstatus[i])))
123 + 				continue;
124 + 
125 + 			if (ret < 0 && errno != ECHILD)
126 + 				warn("(pid=%d): waitpid(%d)", pid, child[i]);
127 + 
128 + 			child[i] *= -1;
129 + 			childs -= 1;
130 + 		}
131 + 
132 + 		if (!childs)
133 + 			break;
134 + 
135 + 		usleep(250000);
136 + 
137 + 		for (i = 0; i < NR_CHILDS; i++) {
138 + 			if (child[i] <= 0)
139 + 				continue;
140 + 			kill(child[i], SIGUSR1);
141 + 		}
142 + 	}
143 + 
144 + 	for (i = 0; i < NR_CHILDS; i++) {
145 + 		if (WIFEXITED(wstatus[i]))
146 + 			warnx("(pid=%d): pid %d exited, status=%d",
147 + 				pid, -child[i], WEXITSTATUS(wstatus[i]));
148 + 		else if (WIFSIGNALED(wstatus[i]))
149 + 			warnx("(pid=%d): pid %d killed by signal %d",
150 + 				pid, -child[i], WTERMSIG(wstatus[i]));
151 + 
152 + 		if (WIFSIGNALED(wstatus[i]) && WTERMSIG(wstatus[i]) == SIGUSR1)
153 + 			continue;
154 + 
155 + 		warnx("(pid=%d): Test failed", pid);
156 + 		exit(EXIT_FAILURE);
157 + 	}
158 + 
159 + 	warnx("(pid=%d): Test passed", pid);
160 + 	exit(EXIT_SUCCESS);
161 + }