Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'prlimit-tasklist_lock-for-v5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull tasklist_lock optimizations from Eric Biederman:
"prlimit and getpriority tasklist_lock optimizations

The tasklist_lock popped up as a scalability bottleneck on some
testing workloads. The readlocks in do_prlimit and set/getpriority are
not necessary in all cases.

Based on a cycles profile, it looked like ~87% of the time was spent
in the kernel, ~42% of which was just trying to get *some* spinlock
(queued_spin_lock_slowpath, not necessarily the tasklist_lock).

The big offenders (with rough percentages in cycles of the overall
trace):
- do_wait 11%
- setpriority 8% (done previously in commit 7f8ca0edfe07)
- kill 8%
- do_exit 5%
- clone 3%
- prlimit64 2% (this patchset)
- getrlimit 1% (this patchset)

I can't easily test this patchset on the original workload for various
reasons. Instead, I used the microbenchmark below to at least verify
there was some improvement. This patchset had a 28% speedup (12% from
baseline to set/getprio, then another 14% for prlimit).

This series used to do the setpriority case, but an almost identical
change was merged as commit 7f8ca0edfe07 ("kernel/sys.c: only take
tasklist_lock for get/setpriority(PRIO_PGRP)") so that has been
dropped from here.

One interesting thing is that my libc's getrlimit() was calling
prlimit64, so hoisting the read_lock(tasklist_lock) into sys_prlimit64
had no effect - it essentially optimized the older syscalls only. I
didn't do that in this patchset, but figured I'd mention it since it
was an option from the previous patch's discussion"

microbenchmark.c:
---------------
/*
 * Microbenchmark quoted verbatim from the pull request: it hammers the
 * syscalls whose tasklist_lock usage this series optimizes
 * (set/getpriority, getrlimit) while other processes churn through
 * fork/kill/wait, which also contend on tasklist_lock.
 *
 * NOTE(review): this is a commit-message excerpt, not a buildable file —
 * it omits the needed #includes (<sys/resource.h>, <sys/wait.h>,
 * <signal.h>, <unistd.h>, <stdlib.h>) and never exits the child's loop;
 * children are terminated externally via SIGTERM.
 */
int main(int argc, char **argv)
{
pid_t child;
struct rlimit rlim[1];

/* Six unchecked forks: 2^6 = 64 copies of this process all run the loop below. */
fork(); fork(); fork(); fork(); fork(); fork();

for (int i = 0; i < 5000; i++) {
child = fork();
if (child < 0)
exit(1);
if (child > 0) {
/* Parent: let the child spin briefly, then kill and reap it
 * (exercises the kill/do_wait paths from the cycle profile). */
usleep(1000);
kill(child, SIGTERM);
waitpid(child, NULL, 0);
} else {
/* Child: spin on the syscalls under test until SIGTERM arrives.
 * The self-assigning setpriority keeps behavior unchanged while
 * still taking the syscall path. */
for (;;) {
setpriority(PRIO_PROCESS, 0,
getpriority(PRIO_PROCESS, 0));
getrlimit(RLIMIT_CPU, rlim);
}
}
}

return 0;
}

Link: https://lore.kernel.org/lkml/20211213220401.1039578-1-brho@google.com/ [v1]
Link: https://lore.kernel.org/lkml/20220105212828.197013-1-brho@google.com/ [v2]
Link: https://lore.kernel.org/lkml/20220106172041.522167-1-brho@google.com/ [v3]

* tag 'prlimit-tasklist_lock-for-v5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
prlimit: do not grab the tasklist_lock
prlimit: make do_prlimit() static

+72 -63
+1 -1
include/linux/posix-timers.h
··· 253 253 void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, 254 254 u64 *newval, u64 *oldval); 255 255 256 - void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); 256 + int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); 257 257 258 258 void posixtimer_rearm(struct kernel_siginfo *info); 259 259 #endif
-2
include/linux/resource.h
··· 8 8 struct task_struct; 9 9 10 10 void getrusage(struct task_struct *p, int who, struct rusage *ru); 11 - int do_prlimit(struct task_struct *tsk, unsigned int resource, 12 - struct rlimit *new_rlim, struct rlimit *old_rlim); 13 11 14 12 #endif
+62 -57
kernel/sys.c
··· 1424 1424 return errno; 1425 1425 } 1426 1426 1427 + /* make sure you are allowed to change @tsk limits before calling this */ 1428 + static int do_prlimit(struct task_struct *tsk, unsigned int resource, 1429 + struct rlimit *new_rlim, struct rlimit *old_rlim) 1430 + { 1431 + struct rlimit *rlim; 1432 + int retval = 0; 1433 + 1434 + if (resource >= RLIM_NLIMITS) 1435 + return -EINVAL; 1436 + if (new_rlim) { 1437 + if (new_rlim->rlim_cur > new_rlim->rlim_max) 1438 + return -EINVAL; 1439 + if (resource == RLIMIT_NOFILE && 1440 + new_rlim->rlim_max > sysctl_nr_open) 1441 + return -EPERM; 1442 + } 1443 + 1444 + /* Holding a refcount on tsk protects tsk->signal from disappearing. */ 1445 + rlim = tsk->signal->rlim + resource; 1446 + task_lock(tsk->group_leader); 1447 + if (new_rlim) { 1448 + /* 1449 + * Keep the capable check against init_user_ns until cgroups can 1450 + * contain all limits. 1451 + */ 1452 + if (new_rlim->rlim_max > rlim->rlim_max && 1453 + !capable(CAP_SYS_RESOURCE)) 1454 + retval = -EPERM; 1455 + if (!retval) 1456 + retval = security_task_setrlimit(tsk, resource, new_rlim); 1457 + } 1458 + if (!retval) { 1459 + if (old_rlim) 1460 + *old_rlim = *rlim; 1461 + if (new_rlim) 1462 + *rlim = *new_rlim; 1463 + } 1464 + task_unlock(tsk->group_leader); 1465 + 1466 + /* 1467 + * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not 1468 + * infinite. In case of RLIM_INFINITY the posix CPU timer code 1469 + * ignores the rlimit. 1470 + */ 1471 + if (!retval && new_rlim && resource == RLIMIT_CPU && 1472 + new_rlim->rlim_cur != RLIM_INFINITY && 1473 + IS_ENABLED(CONFIG_POSIX_TIMERS)) { 1474 + /* 1475 + * update_rlimit_cpu can fail if the task is exiting, but there 1476 + * may be other tasks in the thread group that are not exiting, 1477 + * and they need their cpu timers adjusted. 
1478 + * 1479 + * The group_leader is the last task to be released, so if we 1480 + * cannot update_rlimit_cpu on it, then the entire process is 1481 + * exiting and we do not need to update at all. 1482 + */ 1483 + update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur); 1484 + } 1485 + 1486 + return retval; 1487 + } 1488 + 1427 1489 SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) 1428 1490 { 1429 1491 struct rlimit value; ··· 1627 1565 rlim->rlim_max = RLIM_INFINITY; 1628 1566 else 1629 1567 rlim->rlim_max = (unsigned long)rlim64->rlim_max; 1630 - } 1631 - 1632 - /* make sure you are allowed to change @tsk limits before calling this */ 1633 - int do_prlimit(struct task_struct *tsk, unsigned int resource, 1634 - struct rlimit *new_rlim, struct rlimit *old_rlim) 1635 - { 1636 - struct rlimit *rlim; 1637 - int retval = 0; 1638 - 1639 - if (resource >= RLIM_NLIMITS) 1640 - return -EINVAL; 1641 - if (new_rlim) { 1642 - if (new_rlim->rlim_cur > new_rlim->rlim_max) 1643 - return -EINVAL; 1644 - if (resource == RLIMIT_NOFILE && 1645 - new_rlim->rlim_max > sysctl_nr_open) 1646 - return -EPERM; 1647 - } 1648 - 1649 - /* protect tsk->signal and tsk->sighand from disappearing */ 1650 - read_lock(&tasklist_lock); 1651 - if (!tsk->sighand) { 1652 - retval = -ESRCH; 1653 - goto out; 1654 - } 1655 - 1656 - rlim = tsk->signal->rlim + resource; 1657 - task_lock(tsk->group_leader); 1658 - if (new_rlim) { 1659 - /* Keep the capable check against init_user_ns until 1660 - cgroups can contain all limits */ 1661 - if (new_rlim->rlim_max > rlim->rlim_max && 1662 - !capable(CAP_SYS_RESOURCE)) 1663 - retval = -EPERM; 1664 - if (!retval) 1665 - retval = security_task_setrlimit(tsk, resource, new_rlim); 1666 - } 1667 - if (!retval) { 1668 - if (old_rlim) 1669 - *old_rlim = *rlim; 1670 - if (new_rlim) 1671 - *rlim = *new_rlim; 1672 - } 1673 - task_unlock(tsk->group_leader); 1674 - 1675 - /* 1676 - * RLIMIT_CPU handling. 
Arm the posix CPU timer if the limit is not 1677 - * infinite. In case of RLIM_INFINITY the posix CPU timer code 1678 - * ignores the rlimit. 1679 - */ 1680 - if (!retval && new_rlim && resource == RLIMIT_CPU && 1681 - new_rlim->rlim_cur != RLIM_INFINITY && 1682 - IS_ENABLED(CONFIG_POSIX_TIMERS)) 1683 - update_rlimit_cpu(tsk, new_rlim->rlim_cur); 1684 - out: 1685 - read_unlock(&tasklist_lock); 1686 - return retval; 1687 1568 } 1688 1569 1689 1570 /* rcu lock must be held */
+9 -3
kernel/time/posix-cpu-timers.c
··· 34 34 * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if 35 35 * necessary. Needs siglock protection since other code may update the 36 36 * expiration cache as well. 37 + * 38 + * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and 39 + * we cannot lock_task_sighand. Cannot fail if task is current. 37 40 */ 38 - void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) 41 + int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) 39 42 { 40 43 u64 nsecs = rlim_new * NSEC_PER_SEC; 44 + unsigned long irq_fl; 41 45 42 - spin_lock_irq(&task->sighand->siglock); 46 + if (!lock_task_sighand(task, &irq_fl)) 47 + return -ESRCH; 43 48 set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL); 44 - spin_unlock_irq(&task->sighand->siglock); 49 + unlock_task_sighand(task, &irq_fl); 50 + return 0; 45 51 } 46 52 47 53 /*