Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

seccomp: implement SECCOMP_FILTER_FLAG_TSYNC

Applying restrictive seccomp filter programs to large or diverse
codebases often requires handling threads which may be started early in
the process lifetime (e.g., by code that is linked in). While it is
possible to apply permissive programs prior to process start up, it is
difficult to further restrict the kernel ABI to those threads after that
point.

This change adds a new seccomp syscall flag to SECCOMP_SET_MODE_FILTER for
synchronizing thread group seccomp filters at filter installation time.

When calling seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
filter) an attempt will be made to synchronize all threads in current's
threadgroup to its new seccomp filter program. This is possible iff all
threads are using a filter that is an ancestor to the filter current is
attempting to synchronize to. NULL filters (where the task is running as
SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
transitioned into SECCOMP_MODE_FILTER. If prctrl(PR_SET_NO_NEW_PRIVS,
...) has been set on the calling thread, no_new_privs will be set for
all synchronized threads too. On success, 0 is returned. On failure,
the pid of one of the failing threads will be returned and no filters
will have been applied.

The race conditions against another thread are:
- requesting TSYNC (already handled by sighand lock)
- performing a clone (already handled by sighand lock)
- changing its filter (already handled by sighand lock)
- calling exec (handled by cred_guard_mutex)
The clone case is assisted by the fact that new threads will have their
seccomp state duplicated from their parent before appearing on the tasklist.

Holding cred_guard_mutex means that seccomp filters cannot be assigned
while in the middle of another thread's exec (potentially bypassing
no_new_privs or similar). The call to de_thread() may kill threads waiting
for the mutex.

Changes across threads to the filter pointer include a barrier.

Based on patches by Will Drewry.

Suggested-by: Julien Tinnes <jln@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>

Kees Cook c2e1f2e3 3ba2530c

+140 -2
+1 -1
fs/exec.c
··· 1216 1216 /* 1217 1217 * determine how safe it is to execute the proposed program 1218 1218 * - the caller must hold ->cred_guard_mutex to protect against 1219 - * PTRACE_ATTACH 1219 + * PTRACE_ATTACH or seccomp thread-sync 1220 1220 */ 1221 1221 static void check_unsafe_exec(struct linux_binprm *bprm) 1222 1222 {
+2
include/linux/seccomp.h
··· 3 3 4 4 #include <uapi/linux/seccomp.h> 5 5 6 + #define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC) 7 + 6 8 #ifdef CONFIG_SECCOMP 7 9 8 10 #include <linux/thread_info.h>
+3
include/uapi/linux/seccomp.h
··· 14 14 #define SECCOMP_SET_MODE_STRICT 0 15 15 #define SECCOMP_SET_MODE_FILTER 1 16 16 17 + /* Valid flags for SECCOMP_SET_MODE_FILTER */ 18 + #define SECCOMP_FILTER_FLAG_TSYNC 1 19 + 17 20 /* 18 21 * All BPF programs must return a 32-bit value. 19 22 * The bottom 16-bits are for optional return data.
+134 -1
kernel/seccomp.c
··· 26 26 #ifdef CONFIG_SECCOMP_FILTER 27 27 #include <asm/syscall.h> 28 28 #include <linux/filter.h> 29 + #include <linux/pid.h> 29 30 #include <linux/ptrace.h> 30 31 #include <linux/security.h> 31 32 #include <linux/tracehook.h> ··· 226 225 } 227 226 228 227 #ifdef CONFIG_SECCOMP_FILTER 228 + /* Returns 1 if the parent is an ancestor of the child. */ 229 + static int is_ancestor(struct seccomp_filter *parent, 230 + struct seccomp_filter *child) 231 + { 232 + /* NULL is the root ancestor. */ 233 + if (parent == NULL) 234 + return 1; 235 + for (; child; child = child->prev) 236 + if (child == parent) 237 + return 1; 238 + return 0; 239 + } 240 + 241 + /** 242 + * seccomp_can_sync_threads: checks if all threads can be synchronized 243 + * 244 + * Expects sighand and cred_guard_mutex locks to be held. 245 + * 246 + * Returns 0 on success, -ve on error, or the pid of a thread which was 247 + * either not in the correct seccomp mode or it did not have an ancestral 248 + * seccomp filter. 249 + */ 250 + static inline pid_t seccomp_can_sync_threads(void) 251 + { 252 + struct task_struct *thread, *caller; 253 + 254 + BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); 255 + BUG_ON(!spin_is_locked(&current->sighand->siglock)); 256 + 257 + /* Validate all threads being eligible for synchronization. */ 258 + caller = current; 259 + for_each_thread(caller, thread) { 260 + pid_t failed; 261 + 262 + /* Skip current, since it is initiating the sync. */ 263 + if (thread == caller) 264 + continue; 265 + 266 + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || 267 + (thread->seccomp.mode == SECCOMP_MODE_FILTER && 268 + is_ancestor(thread->seccomp.filter, 269 + caller->seccomp.filter))) 270 + continue; 271 + 272 + /* Return the first thread that cannot be synchronized. 
*/ 273 + failed = task_pid_vnr(thread); 274 + /* If the pid cannot be resolved, then return -ESRCH */ 275 + if (unlikely(WARN_ON(failed == 0))) 276 + failed = -ESRCH; 277 + return failed; 278 + } 279 + 280 + return 0; 281 + } 282 + 283 + /** 284 + * seccomp_sync_threads: sets all threads to use current's filter 285 + * 286 + * Expects sighand and cred_guard_mutex locks to be held, and for 287 + * seccomp_can_sync_threads() to have returned success already 288 + * without dropping the locks. 289 + * 290 + */ 291 + static inline void seccomp_sync_threads(void) 292 + { 293 + struct task_struct *thread, *caller; 294 + 295 + BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); 296 + BUG_ON(!spin_is_locked(&current->sighand->siglock)); 297 + 298 + /* Synchronize all threads. */ 299 + caller = current; 300 + for_each_thread(caller, thread) { 301 + /* Skip current, since it needs no changes. */ 302 + if (thread == caller) 303 + continue; 304 + 305 + /* Get a task reference for the new leaf node. */ 306 + get_seccomp_filter(caller); 307 + /* 308 + * Drop the task reference to the shared ancestor since 309 + * current's path will hold a reference. (This also 310 + * allows a put before the assignment.) 311 + */ 312 + put_seccomp_filter(thread); 313 + smp_store_release(&thread->seccomp.filter, 314 + caller->seccomp.filter); 315 + /* 316 + * Opt the other thread into seccomp if needed. 317 + * As threads are considered to be trust-realm 318 + * equivalent (see ptrace_may_access), it is safe to 319 + * allow one thread to transition the other. 320 + */ 321 + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { 322 + /* 323 + * Don't let an unprivileged task work around 324 + * the no_new_privs restriction by creating 325 + * a thread that sets it up, enters seccomp, 326 + * then dies. 
327 + */ 328 + if (task_no_new_privs(caller)) 329 + task_set_no_new_privs(thread); 330 + 331 + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); 332 + } 333 + } 334 + } 335 + 229 336 /** 230 337 * seccomp_prepare_filter: Prepares a seccomp filter for use. 231 338 * @fprog: BPF program to install ··· 473 364 if (total_insns > MAX_INSNS_PER_PATH) 474 365 return -ENOMEM; 475 366 367 + /* If thread sync has been requested, check that it is possible. */ 368 + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { 369 + int ret; 370 + 371 + ret = seccomp_can_sync_threads(); 372 + if (ret) 373 + return ret; 374 + } 375 + 476 376 /* 477 377 * If there is an existing filter, make it the prev and don't drop its 478 378 * task reference. 479 379 */ 480 380 filter->prev = current->seccomp.filter; 481 381 current->seccomp.filter = filter; 382 + 383 + /* Now that the new filter is in place, synchronize to all threads. */ 384 + if (flags & SECCOMP_FILTER_FLAG_TSYNC) 385 + seccomp_sync_threads(); 482 386 483 387 return 0; 484 388 } ··· 712 590 long ret = -EINVAL; 713 591 714 592 /* Validate flags. */ 715 - if (flags != 0) 593 + if (flags & ~SECCOMP_FILTER_FLAG_MASK) 716 594 return -EINVAL; 717 595 718 596 /* Prepare the new filter before holding any locks. */ 719 597 prepared = seccomp_prepare_user_filter(filter); 720 598 if (IS_ERR(prepared)) 721 599 return PTR_ERR(prepared); 600 + 601 + /* 602 + * Make sure we cannot change seccomp or nnp state via TSYNC 603 + * while another thread is in the middle of calling exec. 
604 + */ 605 + if (flags & SECCOMP_FILTER_FLAG_TSYNC && 606 + mutex_lock_killable(&current->signal->cred_guard_mutex)) 607 + goto out_free; 722 608 723 609 spin_lock_irq(&current->sighand->siglock); 724 610 ··· 742 612 seccomp_assign_mode(current, seccomp_mode); 743 613 out: 744 614 spin_unlock_irq(&current->sighand->siglock); 615 + if (flags & SECCOMP_FILTER_FLAG_TSYNC) 616 + mutex_unlock(&current->signal->cred_guard_mutex); 617 + out_free: 745 618 seccomp_filter_free(prepared); 746 619 return ret; 747 620 }