Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

exec: Add exec_update_mutex to replace cred_guard_mutex

The cred_guard_mutex is problematic as it is held over possibly
indefinite waits for userspace. The possible indefinite waits for
userspace that I have identified are: The cred_guard_mutex is held in
PTRACE_EVENT_EXIT waiting for the tracer. The cred_guard_mutex is
held over "put_user(0, tsk->clear_child_tid)" in exit_mm(). The
cred_guard_mutex is held over "get_user(futex_offset, ...)" in
exit_robust_list. The cred_guard_mutex is held over copy_strings.

The functions get_user and put_user can trigger a page fault which can
potentially wait indefinitely in the case of userfaultfd or if
userspace implements part of the page fault path.

In any of those cases the userspace process that the kernel is waiting
for might make a different system call that winds up taking the
cred_guard_mutex and result in a deadlock.

Holding a mutex over any of those possibly indefinite waits for
userspace does not appear necessary. Add exec_update_mutex that will
just cover updating the process during exec where the permissions and
the objects pointed to by the task struct may be out of sync.

The plan is to switch the users of cred_guard_mutex to
exec_update_mutex one by one. This lets us move forward while still
being careful and not introducing any regressions.

Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

+36 -5
+19 -3
fs/exec.c
··· 1010 1010 } 1011 1011 EXPORT_SYMBOL(read_code); 1012 1012 1013 + /* 1014 + * Maps the mm_struct mm into the current task struct. 1015 + * On success, this function returns with the mutex 1016 + * exec_update_mutex locked. 1017 + */ 1013 1018 static int exec_mmap(struct mm_struct *mm) 1014 1019 { 1015 1020 struct task_struct *tsk; 1016 1021 struct mm_struct *old_mm, *active_mm; 1022 + int ret; 1017 1023 1018 1024 /* Notify parent that we're no longer interested in the old VM */ 1019 1025 tsk = current; 1020 1026 old_mm = current->mm; 1021 1027 exec_mm_release(tsk, old_mm); 1028 + 1029 + ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); 1030 + if (ret) 1031 + return ret; 1022 1032 1023 1033 if (old_mm) { 1024 1034 sync_mm_rss(old_mm); ··· 1041 1031 down_read(&old_mm->mmap_sem); 1042 1032 if (unlikely(old_mm->core_state)) { 1043 1033 up_read(&old_mm->mmap_sem); 1034 + mutex_unlock(&tsk->signal->exec_update_mutex); 1044 1035 return -EINTR; 1045 1036 } 1046 1037 } 1038 + 1047 1039 task_lock(tsk); 1048 1040 active_mm = tsk->active_mm; 1049 1041 membarrier_exec_mmap(mm); ··· 1300 1288 goto out; 1301 1289 1302 1290 /* 1303 - * After clearing bprm->mm (to mark that current is using the 1304 - * prepared mm now), we have nothing left of the original 1291 + * After setting bprm->called_exec_mmap (to mark that current is 1292 + * using the prepared mm now), we have nothing left of the original 1305 1293 * process. If anything from here on returns an error, the check 1306 1294 * in search_binary_handler() will SEGV current. 
1307 1295 */ 1296 + bprm->called_exec_mmap = 1; 1308 1297 bprm->mm = NULL; 1309 1298 1310 1299 #ifdef CONFIG_POSIX_TIMERS ··· 1451 1438 { 1452 1439 free_arg_pages(bprm); 1453 1440 if (bprm->cred) { 1441 + if (bprm->called_exec_mmap) 1442 + mutex_unlock(&current->signal->exec_update_mutex); 1454 1443 mutex_unlock(&current->signal->cred_guard_mutex); 1455 1444 abort_creds(bprm->cred); 1456 1445 } ··· 1502 1487 * credentials; any time after this it may be unlocked. 1503 1488 */ 1504 1489 security_bprm_committed_creds(bprm); 1490 + mutex_unlock(&current->signal->exec_update_mutex); 1505 1491 mutex_unlock(&current->signal->cred_guard_mutex); 1506 1492 } 1507 1493 EXPORT_SYMBOL(install_exec_creds); ··· 1694 1678 1695 1679 read_lock(&binfmt_lock); 1696 1680 put_binfmt(fmt); 1697 - if (retval < 0 && !bprm->mm) { 1681 + if (retval < 0 && bprm->called_exec_mmap) { 1698 1682 /* we got to flush_old_exec() and failed after it */ 1699 1683 read_unlock(&binfmt_lock); 1700 1684 force_sigsegv(SIGSEGV);
+7 -1
include/linux/binfmts.h
··· 44 44 * exec has happened. Used to sanitize execution environment 45 45 * and to set AT_SECURE auxv for glibc. 46 46 */ 47 - secureexec:1; 47 + secureexec:1, 48 + /* 49 + * Set by flush_old_exec, when exec_mmap has been called. 50 + * This is past the point of no return, when the 51 + * exec_update_mutex has been taken. 52 + */ 53 + called_exec_mmap:1; 48 54 #ifdef __alpha__ 49 55 unsigned int taso:1; 50 56 #endif
+8 -1
include/linux/sched/signal.h
··· 224 224 225 225 struct mutex cred_guard_mutex; /* guard against foreign influences on 226 226 * credential calculations 227 - * (notably. ptrace) */ 227 + * (notably. ptrace) 228 + * Deprecated do not use in new code. 229 + * Use exec_update_mutex instead. 230 + */ 231 + struct mutex exec_update_mutex; /* Held while task_struct is being 232 + * updated during exec, and may have 233 + * inconsistent permissions. 234 + */ 228 235 } __randomize_layout; 229 236 230 237 /*
+1
init/init_task.c
··· 26 26 .multiprocess = HLIST_HEAD_INIT, 27 27 .rlim = INIT_RLIMITS, 28 28 .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), 29 + .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex), 29 30 #ifdef CONFIG_POSIX_TIMERS 30 31 .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), 31 32 .cputimer = {
+1
kernel/fork.c
··· 1594 1594 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1595 1595 1596 1596 mutex_init(&sig->cred_guard_mutex); 1597 + mutex_init(&sig->exec_update_mutex); 1597 1598 1598 1599 return 0; 1599 1600 }