Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'seccomp-v5.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux

Pull seccomp updates from Kees Cook:
"There are a bunch of cleanups and selftest improvements along with
two major updates to the SECCOMP_RET_USER_NOTIF filter return:
EPOLLHUP support to more easily detect the death of a monitored
process, and being able to inject fds when intercepting syscalls that
expect an fd-opening side-effect (needed by both container folks and
Chrome). The latter continued the refactoring of __scm_install_fd()
started by Christoph, and in the process found and fixed a handful of
bugs in various callers.

- Improved selftest coverage, timeouts, and reporting

- Add EPOLLHUP support for SECCOMP_RET_USER_NOTIF (Christian Brauner)

- Refactor __scm_install_fd() into __receive_fd() and fix buggy
callers

- Introduce 'addfd' command for SECCOMP_RET_USER_NOTIF (Sargun
Dhillon)"

* tag 'seccomp-v5.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux: (30 commits)
selftests/seccomp: Test SECCOMP_IOCTL_NOTIF_ADDFD
seccomp: Introduce addfd ioctl to seccomp user notifier
fs: Expand __receive_fd() to accept existing fd
pidfd: Replace open-coded receive_fd()
fs: Add receive_fd() wrapper for __receive_fd()
fs: Move __scm_install_fd() to __receive_fd()
net/scm: Regularize compat handling of scm_detach_fds()
pidfd: Add missing sock updates for pidfd_getfd()
net/compat: Add missing sock updates for SCM_RIGHTS
selftests/seccomp: Check ENOSYS under tracing
selftests/seccomp: Refactor to use fixture variants
selftests/harness: Clean up kern-doc for fixtures
seccomp: Use -1 marker for end of mode 1 syscall list
seccomp: Fix ioctl number for SECCOMP_IOCTL_NOTIF_ID_VALID
selftests/seccomp: Rename user_trap_syscall() to user_notif_syscall()
selftests/seccomp: Make kcmp() less required
seccomp: Use pr_fmt
selftests/seccomp: Improve calibration loop
selftests/seccomp: use 90s as timeout
selftests/seccomp: Expand benchmark to per-filter measurements
...

+1060 -391
+2 -2
arch/mips/include/asm/seccomp.h
··· 9 9 static const int syscalls_O32[] = { 10 10 __NR_O32_Linux + 3, __NR_O32_Linux + 4, 11 11 __NR_O32_Linux + 1, __NR_O32_Linux + 193, 12 - 0, /* null terminated */ 12 + -1, /* negative terminated */ 13 13 }; 14 14 static const int syscalls_N32[] = { 15 15 __NR_N32_Linux + 0, __NR_N32_Linux + 1, 16 16 __NR_N32_Linux + 58, __NR_N32_Linux + 211, 17 - 0, /* null terminated */ 17 + -1, /* negative terminated */ 18 18 }; 19 19 20 20 if (IS_ENABLED(CONFIG_MIPS32_O32) && test_thread_flag(TIF_32BIT_REGS))
+61
fs/file.c
··· 18 18 #include <linux/bitops.h> 19 19 #include <linux/spinlock.h> 20 20 #include <linux/rcupdate.h> 21 + #include <net/sock.h> 21 22 22 23 unsigned int sysctl_nr_open __read_mostly = 1024*1024; 23 24 unsigned int sysctl_nr_open_min = BITS_PER_LONG; ··· 614 613 rcu_read_unlock_sched(); 615 614 } 616 615 616 + /* 617 + * This consumes the "file" refcount, so callers should treat it 618 + * as if they had called fput(file). 619 + */ 617 620 void fd_install(unsigned int fd, struct file *file) 618 621 { 619 622 __fd_install(current->files, fd, file); ··· 934 929 out_unlock: 935 930 spin_unlock(&files->file_lock); 936 931 return err; 932 + } 933 + 934 + /** 935 + * __receive_fd() - Install received file into file descriptor table 936 + * 937 + * @fd: fd to install into (if negative, a new fd will be allocated) 938 + * @file: struct file that was received from another process 939 + * @ufd: __user pointer to write new fd number to 940 + * @o_flags: the O_* flags to apply to the new fd entry 941 + * 942 + * Installs a received file into the file descriptor table, with appropriate 943 + * checks and count updates. Optionally writes the fd number to userspace, if 944 + * @ufd is non-NULL. 945 + * 946 + * This helper handles its own reference counting of the incoming 947 + * struct file. 948 + * 949 + * Returns newly install fd or -ve on error. 
950 + */ 951 + int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags) 952 + { 953 + int new_fd; 954 + int error; 955 + 956 + error = security_file_receive(file); 957 + if (error) 958 + return error; 959 + 960 + if (fd < 0) { 961 + new_fd = get_unused_fd_flags(o_flags); 962 + if (new_fd < 0) 963 + return new_fd; 964 + } else { 965 + new_fd = fd; 966 + } 967 + 968 + if (ufd) { 969 + error = put_user(new_fd, ufd); 970 + if (error) { 971 + if (fd < 0) 972 + put_unused_fd(new_fd); 973 + return error; 974 + } 975 + } 976 + 977 + if (fd < 0) { 978 + fd_install(new_fd, get_file(file)); 979 + } else { 980 + error = replace_fd(new_fd, file, o_flags); 981 + if (error) 982 + return error; 983 + } 984 + 985 + /* Bump the sock usage counts, if any. */ 986 + __receive_sock(file); 987 + return new_fd; 937 988 } 938 989 939 990 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
+2
fs/proc/array.c
··· 341 341 seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); 342 342 #ifdef CONFIG_SECCOMP 343 343 seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); 344 + seq_put_decimal_ull(m, "\nSeccomp_filters:\t", 345 + atomic_read(&p->seccomp.filter_count)); 344 346 #endif 345 347 seq_puts(m, "\nSpeculation_Store_Bypass:\t"); 346 348 switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
+1 -1
include/asm-generic/seccomp.h
··· 33 33 static const int mode1_syscalls_32[] = { 34 34 __NR_seccomp_read_32, __NR_seccomp_write_32, 35 35 __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, 36 - 0, /* null terminated */ 36 + -1, /* negative terminated */ 37 37 }; 38 38 return mode1_syscalls_32; 39 39 }
+19
include/linux/file.h
··· 9 9 #include <linux/compiler.h> 10 10 #include <linux/types.h> 11 11 #include <linux/posix_types.h> 12 + #include <linux/errno.h> 12 13 13 14 struct file; 14 15 ··· 91 90 extern void put_unused_fd(unsigned int fd); 92 91 93 92 extern void fd_install(unsigned int fd, struct file *file); 93 + 94 + extern int __receive_fd(int fd, struct file *file, int __user *ufd, 95 + unsigned int o_flags); 96 + static inline int receive_fd_user(struct file *file, int __user *ufd, 97 + unsigned int o_flags) 98 + { 99 + if (ufd == NULL) 100 + return -EFAULT; 101 + return __receive_fd(-1, file, ufd, o_flags); 102 + } 103 + static inline int receive_fd(struct file *file, unsigned int o_flags) 104 + { 105 + return __receive_fd(-1, file, NULL, o_flags); 106 + } 107 + static inline int receive_fd_replace(int fd, struct file *file, unsigned int o_flags) 108 + { 109 + return __receive_fd(fd, file, NULL, o_flags); 110 + } 94 111 95 112 extern void flush_delayed_fput(void); 96 113 extern void __fput_sync(struct file *);
+8 -2
include/linux/seccomp.h
··· 10 10 SECCOMP_FILTER_FLAG_NEW_LISTENER | \ 11 11 SECCOMP_FILTER_FLAG_TSYNC_ESRCH) 12 12 13 + /* sizeof() the first published struct seccomp_notif_addfd */ 14 + #define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24 15 + #define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0 16 + 13 17 #ifdef CONFIG_SECCOMP 14 18 15 19 #include <linux/thread_info.h> 20 + #include <linux/atomic.h> 16 21 #include <asm/seccomp.h> 17 22 18 23 struct seccomp_filter; ··· 34 29 */ 35 30 struct seccomp { 36 31 int mode; 32 + atomic_t filter_count; 37 33 struct seccomp_filter *filter; 38 34 }; 39 35 ··· 88 82 #endif /* CONFIG_SECCOMP */ 89 83 90 84 #ifdef CONFIG_SECCOMP_FILTER 91 - extern void put_seccomp_filter(struct task_struct *tsk); 85 + extern void seccomp_filter_release(struct task_struct *tsk); 92 86 extern void get_seccomp_filter(struct task_struct *tsk); 93 87 #else /* CONFIG_SECCOMP_FILTER */ 94 - static inline void put_seccomp_filter(struct task_struct *tsk) 88 + static inline void seccomp_filter_release(struct task_struct *tsk) 95 89 { 96 90 return; 97 91 }
+4
include/net/sock.h
··· 891 891 { 892 892 return static_branch_unlikely(&memalloc_socks_key); 893 893 } 894 + 895 + void __receive_sock(struct file *file); 894 896 #else 895 897 896 898 static inline int sk_memalloc_socks(void) ··· 900 898 return 0; 901 899 } 902 900 901 + static inline void __receive_sock(struct file *file) 902 + { } 903 903 #endif 904 904 905 905 static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
+24 -1
include/uapi/linux/seccomp.h
··· 113 113 __u32 flags; 114 114 }; 115 115 116 + /* valid flags for seccomp_notif_addfd */ 117 + #define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ 118 + 119 + /** 120 + * struct seccomp_notif_addfd 121 + * @id: The ID of the seccomp notification 122 + * @flags: SECCOMP_ADDFD_FLAG_* 123 + * @srcfd: The local fd number 124 + * @newfd: Optional remote FD number if SETFD option is set, otherwise 0. 125 + * @newfd_flags: The O_* flags the remote FD should have applied 126 + */ 127 + struct seccomp_notif_addfd { 128 + __u64 id; 129 + __u32 flags; 130 + __u32 srcfd; 131 + __u32 newfd; 132 + __u32 newfd_flags; 133 + }; 134 + 116 135 #define SECCOMP_IOC_MAGIC '!' 117 136 #define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) 118 137 #define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) ··· 142 123 #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) 143 124 #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ 144 125 struct seccomp_notif_resp) 145 - #define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) 126 + #define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) 127 + /* On success, the return value is the remote process's added fd number */ 128 + #define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \ 129 + struct seccomp_notif_addfd) 130 + 146 131 #endif /* _UAPI_LINUX_SECCOMP_H */
+3
init/init_task.c
··· 204 204 #ifdef CONFIG_SECURITY 205 205 .security = NULL, 206 206 #endif 207 + #ifdef CONFIG_SECCOMP 208 + .seccomp = { .filter_count = ATOMIC_INIT(0) }, 209 + #endif 207 210 }; 208 211 EXPORT_SYMBOL(init_task); 209 212
+1
kernel/exit.c
··· 217 217 } 218 218 219 219 write_unlock_irq(&tasklist_lock); 220 + seccomp_filter_release(p); 220 221 proc_flush_pid(thread_pid); 221 222 put_pid(thread_pid); 222 223 release_thread(p);
-1
kernel/fork.c
··· 479 479 #endif 480 480 rt_mutex_debug_task_free(tsk); 481 481 ftrace_graph_exit_task(tsk); 482 - put_seccomp_filter(tsk); 483 482 arch_release_task_struct(tsk); 484 483 if (tsk->flags & PF_KTHREAD) 485 484 free_kthread_struct(tsk);
+3 -11
kernel/pid.c
··· 42 42 #include <linux/sched/signal.h> 43 43 #include <linux/sched/task.h> 44 44 #include <linux/idr.h> 45 + #include <net/sock.h> 45 46 46 47 struct pid init_struct_pid = { 47 48 .count = REFCOUNT_INIT(1), ··· 636 635 if (IS_ERR(file)) 637 636 return PTR_ERR(file); 638 637 639 - ret = security_file_receive(file); 640 - if (ret) { 641 - fput(file); 642 - return ret; 643 - } 644 - 645 - ret = get_unused_fd_flags(O_CLOEXEC); 646 - if (ret < 0) 647 - fput(file); 648 - else 649 - fd_install(ret, file); 638 + ret = receive_fd(file, O_CLOEXEC); 639 + fput(file); 650 640 651 641 return ret; 652 642 }
+304 -72
kernel/seccomp.c
··· 13 13 * Mode 2 allows user-defined system call filters in the form 14 14 * of Berkeley Packet Filters/Linux Socket Filters. 15 15 */ 16 + #define pr_fmt(fmt) "seccomp: " fmt 16 17 17 18 #include <linux/refcount.h> 18 19 #include <linux/audit.h> ··· 42 41 #include <linux/tracehook.h> 43 42 #include <linux/uaccess.h> 44 43 #include <linux/anon_inodes.h> 44 + #include <linux/lockdep.h> 45 + 46 + /* 47 + * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the 48 + * wrong direction flag in the ioctl number. This is the broken one, 49 + * which the kernel needs to keep supporting until all userspaces stop 50 + * using the wrong command number. 51 + */ 52 + #define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) 45 53 46 54 enum notify_state { 47 55 SECCOMP_NOTIFY_INIT, ··· 87 77 long val; 88 78 u32 flags; 89 79 90 - /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ 80 + /* 81 + * Signals when this has changed states, such as the listener 82 + * dying, a new seccomp addfd message, or changing to REPLIED 83 + */ 91 84 struct completion ready; 92 85 86 + struct list_head list; 87 + 88 + /* outstanding addfd requests */ 89 + struct list_head addfd; 90 + }; 91 + 92 + /** 93 + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages 94 + * 95 + * @file: A reference to the file to install in the other task 96 + * @fd: The fd number to install it at. If the fd number is -1, it means the 97 + * installing process should allocate the fd as normal. 98 + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC 99 + * is allowed. 100 + * @ret: The return value of the installing process. It is set to the fd num 101 + * upon success (>= 0). 
102 + * @completion: Indicates that the installing process has completed fd 103 + * installation, or gone away (either due to successful 104 + * reply, or signal) 105 + * 106 + */ 107 + struct seccomp_kaddfd { 108 + struct file *file; 109 + int fd; 110 + unsigned int flags; 111 + 112 + /* To only be set on reply */ 113 + int ret; 114 + struct completion completion; 93 115 struct list_head list; 94 116 }; 95 117 ··· 136 94 * filter->notify_lock. 137 95 * @next_id: The id of the next request. 138 96 * @notifications: A list of struct seccomp_knotif elements. 139 - * @wqh: A wait queue for poll. 140 97 */ 141 98 struct notification { 142 99 struct semaphore request; 143 100 u64 next_id; 144 101 struct list_head notifications; 145 - wait_queue_head_t wqh; 146 102 }; 147 103 148 104 /** 149 105 * struct seccomp_filter - container for seccomp BPF programs 150 106 * 151 - * @usage: reference count to manage the object lifetime. 152 - * get/put helpers should be used when accessing an instance 153 - * outside of a lifetime-guarded section. In general, this 154 - * is only needed for handling filters shared across tasks. 107 + * @refs: Reference count to manage the object lifetime. 108 + * A filter's reference count is incremented for each directly 109 + * attached task, once for the dependent filter, and if 110 + * requested for the user notifier. When @refs reaches zero, 111 + * the filter can be freed. 112 + * @users: A filter's @users count is incremented for each directly 113 + * attached task (filter installation, fork(), thread_sync), 114 + * and once for the dependent filter (tracked in filter->prev). 115 + * When it reaches zero it indicates that no direct or indirect 116 + * users of that filter exist. No new tasks can get associated with 117 + * this filter after reaching 0. The @users count is always smaller 118 + * or equal to @refs. Hence, reaching 0 for @users does not mean 119 + * the filter can be freed. 
155 120 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged 156 121 * @prev: points to a previously installed, or inherited, filter 157 122 * @prog: the BPF program to evaluate 158 123 * @notif: the struct that holds all notification related information 159 124 * @notify_lock: A lock for all notification-related accesses. 125 + * @wqh: A wait queue for poll if a notifier is in use. 160 126 * 161 127 * seccomp_filter objects are organized in a tree linked via the @prev 162 128 * pointer. For any task, it appears to be a singly-linked list starting ··· 174 124 * how namespaces work. 175 125 * 176 126 * seccomp_filter objects should never be modified after being attached 177 - * to a task_struct (other than @usage). 127 + * to a task_struct (other than @refs). 178 128 */ 179 129 struct seccomp_filter { 180 - refcount_t usage; 130 + refcount_t refs; 131 + refcount_t users; 181 132 bool log; 182 133 struct seccomp_filter *prev; 183 134 struct bpf_prog *prog; 184 135 struct notification *notif; 185 136 struct mutex notify_lock; 137 + wait_queue_head_t wqh; 186 138 }; 187 139 188 140 /* Limit any path through the tree to 256KB worth of instructions. */ ··· 418 366 return 0; 419 367 } 420 368 369 + static inline void seccomp_filter_free(struct seccomp_filter *filter) 370 + { 371 + if (filter) { 372 + bpf_prog_destroy(filter->prog); 373 + kfree(filter); 374 + } 375 + } 376 + 377 + static void __seccomp_filter_orphan(struct seccomp_filter *orig) 378 + { 379 + while (orig && refcount_dec_and_test(&orig->users)) { 380 + if (waitqueue_active(&orig->wqh)) 381 + wake_up_poll(&orig->wqh, EPOLLHUP); 382 + orig = orig->prev; 383 + } 384 + } 385 + 386 + static void __put_seccomp_filter(struct seccomp_filter *orig) 387 + { 388 + /* Clean up single-reference branches iteratively. 
*/ 389 + while (orig && refcount_dec_and_test(&orig->refs)) { 390 + struct seccomp_filter *freeme = orig; 391 + orig = orig->prev; 392 + seccomp_filter_free(freeme); 393 + } 394 + } 395 + 396 + static void __seccomp_filter_release(struct seccomp_filter *orig) 397 + { 398 + /* Notify about any unused filters in the task's former filter tree. */ 399 + __seccomp_filter_orphan(orig); 400 + /* Finally drop all references to the task's former tree. */ 401 + __put_seccomp_filter(orig); 402 + } 403 + 404 + /** 405 + * seccomp_filter_release - Detach the task from its filter tree, 406 + * drop its reference count, and notify 407 + * about unused filters 408 + * 409 + * This function should only be called when the task is exiting as 410 + * it detaches it from its filter tree. As such, READ_ONCE() and 411 + * barriers are not needed here, as would normally be needed. 412 + */ 413 + void seccomp_filter_release(struct task_struct *tsk) 414 + { 415 + struct seccomp_filter *orig = tsk->seccomp.filter; 416 + 417 + /* Detach task from its filter tree. */ 418 + tsk->seccomp.filter = NULL; 419 + __seccomp_filter_release(orig); 420 + } 421 + 421 422 /** 422 423 * seccomp_sync_threads: sets all threads to use current's filter 423 424 * ··· 495 390 496 391 /* Get a task reference for the new leaf node. */ 497 392 get_seccomp_filter(caller); 393 + 498 394 /* 499 395 * Drop the task reference to the shared ancestor since 500 396 * current's path will hold a reference. (This also 501 397 * allows a put before the assignment.) 502 398 */ 503 - put_seccomp_filter(thread); 399 + __seccomp_filter_release(thread->seccomp.filter); 400 + 401 + /* Make our new filter tree visible. 
*/ 504 402 smp_store_release(&thread->seccomp.filter, 505 403 caller->seccomp.filter); 404 + atomic_set(&thread->seccomp.filter_count, 405 + atomic_read(&thread->seccomp.filter_count)); 506 406 507 407 /* 508 408 * Don't let an unprivileged task work around ··· 571 461 return ERR_PTR(ret); 572 462 } 573 463 574 - refcount_set(&sfilter->usage, 1); 464 + refcount_set(&sfilter->refs, 1); 465 + refcount_set(&sfilter->users, 1); 466 + init_waitqueue_head(&sfilter->wqh); 575 467 576 468 return sfilter; 577 469 } ··· 656 544 */ 657 545 filter->prev = current->seccomp.filter; 658 546 current->seccomp.filter = filter; 547 + atomic_inc(&current->seccomp.filter_count); 659 548 660 549 /* Now that the new filter is in place, synchronize to all threads. */ 661 550 if (flags & SECCOMP_FILTER_FLAG_TSYNC) ··· 667 554 668 555 static void __get_seccomp_filter(struct seccomp_filter *filter) 669 556 { 670 - refcount_inc(&filter->usage); 557 + refcount_inc(&filter->refs); 671 558 } 672 559 673 560 /* get_seccomp_filter - increments the reference count of the filter on @tsk */ ··· 677 564 if (!orig) 678 565 return; 679 566 __get_seccomp_filter(orig); 680 - } 681 - 682 - static inline void seccomp_filter_free(struct seccomp_filter *filter) 683 - { 684 - if (filter) { 685 - bpf_prog_destroy(filter->prog); 686 - kfree(filter); 687 - } 688 - } 689 - 690 - static void __put_seccomp_filter(struct seccomp_filter *orig) 691 - { 692 - /* Clean up single-reference branches iteratively. 
*/ 693 - while (orig && refcount_dec_and_test(&orig->usage)) { 694 - struct seccomp_filter *freeme = orig; 695 - orig = orig->prev; 696 - seccomp_filter_free(freeme); 697 - } 698 - } 699 - 700 - /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ 701 - void put_seccomp_filter(struct task_struct *tsk) 702 - { 703 - __put_seccomp_filter(tsk->seccomp.filter); 567 + refcount_inc(&orig->users); 704 568 } 705 569 706 570 static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) ··· 774 684 */ 775 685 static const int mode1_syscalls[] = { 776 686 __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, 777 - 0, /* null terminated */ 687 + -1, /* negative terminated */ 778 688 }; 779 689 780 690 static void __secure_computing_strict(int this_syscall) 781 691 { 782 - const int *syscall_whitelist = mode1_syscalls; 692 + const int *allowed_syscalls = mode1_syscalls; 783 693 #ifdef CONFIG_COMPAT 784 694 if (in_compat_syscall()) 785 - syscall_whitelist = get_compat_mode1_syscalls(); 695 + allowed_syscalls = get_compat_mode1_syscalls(); 786 696 #endif 787 697 do { 788 - if (*syscall_whitelist == this_syscall) 698 + if (*allowed_syscalls == this_syscall) 789 699 return; 790 - } while (*++syscall_whitelist); 700 + } while (*++allowed_syscalls != -1); 791 701 792 702 #ifdef SECCOMP_DEBUG 793 703 dump_stack(); ··· 825 735 return filter->notif->next_id++; 826 736 } 827 737 738 + static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) 739 + { 740 + /* 741 + * Remove the notification, and reset the list pointers, indicating 742 + * that it has been handled. 
743 + */ 744 + list_del_init(&addfd->list); 745 + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); 746 + complete(&addfd->completion); 747 + } 748 + 828 749 static int seccomp_do_user_notification(int this_syscall, 829 750 struct seccomp_filter *match, 830 751 const struct seccomp_data *sd) ··· 844 743 u32 flags = 0; 845 744 long ret = 0; 846 745 struct seccomp_knotif n = {}; 746 + struct seccomp_kaddfd *addfd, *tmp; 847 747 848 748 mutex_lock(&match->notify_lock); 849 749 err = -ENOSYS; ··· 857 755 n.id = seccomp_next_notify_id(match); 858 756 init_completion(&n.ready); 859 757 list_add(&n.list, &match->notif->notifications); 758 + INIT_LIST_HEAD(&n.addfd); 860 759 861 760 up(&match->notif->request); 862 - wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM); 761 + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); 863 762 mutex_unlock(&match->notify_lock); 864 763 865 764 /* 866 765 * This is where we wait for a reply from userspace. 867 766 */ 767 + wait: 868 768 err = wait_for_completion_interruptible(&n.ready); 869 769 mutex_lock(&match->notify_lock); 870 770 if (err == 0) { 771 + /* Check if we were woken up by a addfd message */ 772 + addfd = list_first_entry_or_null(&n.addfd, 773 + struct seccomp_kaddfd, list); 774 + if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { 775 + seccomp_handle_addfd(addfd); 776 + mutex_unlock(&match->notify_lock); 777 + goto wait; 778 + } 871 779 ret = n.val; 872 780 err = n.error; 873 781 flags = n.flags; 874 782 } 875 783 784 + /* If there were any pending addfd calls, clear them out */ 785 + list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { 786 + /* The process went away before we got a chance to handle it */ 787 + addfd->ret = -ESRCH; 788 + list_del_init(&addfd->list); 789 + complete(&addfd->completion); 790 + } 791 + 876 792 /* 877 793 * Note that it's possible the listener died in between the time when 878 - * we were notified of a respons (or a signal) and when we were able to 794 + * we were 
notified of a response (or a signal) and when we were able to 879 795 * re-acquire the lock, so only delete from the list if the 880 796 * notification actually exists. 881 797 * ··· 1131 1011 knotif->error = -ENOSYS; 1132 1012 knotif->val = 0; 1133 1013 1014 + /* 1015 + * We do not need to wake up any pending addfd messages, as 1016 + * the notifier will do that for us, as this just looks 1017 + * like a standard reply. 1018 + */ 1134 1019 complete(&knotif->ready); 1135 1020 } 1136 1021 ··· 1145 1020 __put_seccomp_filter(filter); 1146 1021 return 0; 1147 1022 } 1023 + 1024 + /* must be called with notif_lock held */ 1025 + static inline struct seccomp_knotif * 1026 + find_notification(struct seccomp_filter *filter, u64 id) 1027 + { 1028 + struct seccomp_knotif *cur; 1029 + 1030 + lockdep_assert_held(&filter->notify_lock); 1031 + 1032 + list_for_each_entry(cur, &filter->notif->notifications, list) { 1033 + if (cur->id == id) 1034 + return cur; 1035 + } 1036 + 1037 + return NULL; 1038 + } 1039 + 1148 1040 1149 1041 static long seccomp_notify_recv(struct seccomp_filter *filter, 1150 1042 void __user *buf) ··· 1206 1064 unotif.data = *(knotif->data); 1207 1065 1208 1066 knotif->state = SECCOMP_NOTIFY_SENT; 1209 - wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM); 1067 + wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM); 1210 1068 ret = 0; 1211 1069 out: 1212 1070 mutex_unlock(&filter->notify_lock); ··· 1220 1078 * may have died when we released the lock, so we need to make 1221 1079 * sure it's still around. 
1222 1080 */ 1223 - knotif = NULL; 1224 1081 mutex_lock(&filter->notify_lock); 1225 - list_for_each_entry(cur, &filter->notif->notifications, list) { 1226 - if (cur->id == unotif.id) { 1227 - knotif = cur; 1228 - break; 1229 - } 1230 - } 1231 - 1082 + knotif = find_notification(filter, unotif.id); 1232 1083 if (knotif) { 1233 1084 knotif->state = SECCOMP_NOTIFY_INIT; 1234 1085 up(&filter->notif->request); ··· 1236 1101 void __user *buf) 1237 1102 { 1238 1103 struct seccomp_notif_resp resp = {}; 1239 - struct seccomp_knotif *knotif = NULL, *cur; 1104 + struct seccomp_knotif *knotif; 1240 1105 long ret; 1241 1106 1242 1107 if (copy_from_user(&resp, buf, sizeof(resp))) ··· 1253 1118 if (ret < 0) 1254 1119 return ret; 1255 1120 1256 - list_for_each_entry(cur, &filter->notif->notifications, list) { 1257 - if (cur->id == resp.id) { 1258 - knotif = cur; 1259 - break; 1260 - } 1261 - } 1262 - 1121 + knotif = find_notification(filter, resp.id); 1263 1122 if (!knotif) { 1264 1123 ret = -ENOENT; 1265 1124 goto out; ··· 1279 1150 static long seccomp_notify_id_valid(struct seccomp_filter *filter, 1280 1151 void __user *buf) 1281 1152 { 1282 - struct seccomp_knotif *knotif = NULL; 1153 + struct seccomp_knotif *knotif; 1283 1154 u64 id; 1284 1155 long ret; 1285 1156 ··· 1290 1161 if (ret < 0) 1291 1162 return ret; 1292 1163 1293 - ret = -ENOENT; 1294 - list_for_each_entry(knotif, &filter->notif->notifications, list) { 1295 - if (knotif->id == id) { 1296 - if (knotif->state == SECCOMP_NOTIFY_SENT) 1297 - ret = 0; 1298 - goto out; 1299 - } 1164 + knotif = find_notification(filter, id); 1165 + if (knotif && knotif->state == SECCOMP_NOTIFY_SENT) 1166 + ret = 0; 1167 + else 1168 + ret = -ENOENT; 1169 + 1170 + mutex_unlock(&filter->notify_lock); 1171 + return ret; 1172 + } 1173 + 1174 + static long seccomp_notify_addfd(struct seccomp_filter *filter, 1175 + struct seccomp_notif_addfd __user *uaddfd, 1176 + unsigned int size) 1177 + { 1178 + struct seccomp_notif_addfd addfd; 1179 + 
struct seccomp_knotif *knotif; 1180 + struct seccomp_kaddfd kaddfd; 1181 + int ret; 1182 + 1183 + BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0); 1184 + BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST); 1185 + 1186 + if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE) 1187 + return -EINVAL; 1188 + 1189 + ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size); 1190 + if (ret) 1191 + return ret; 1192 + 1193 + if (addfd.newfd_flags & ~O_CLOEXEC) 1194 + return -EINVAL; 1195 + 1196 + if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) 1197 + return -EINVAL; 1198 + 1199 + if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) 1200 + return -EINVAL; 1201 + 1202 + kaddfd.file = fget(addfd.srcfd); 1203 + if (!kaddfd.file) 1204 + return -EBADF; 1205 + 1206 + kaddfd.flags = addfd.newfd_flags; 1207 + kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? 1208 + addfd.newfd : -1; 1209 + init_completion(&kaddfd.completion); 1210 + 1211 + ret = mutex_lock_interruptible(&filter->notify_lock); 1212 + if (ret < 0) 1213 + goto out; 1214 + 1215 + knotif = find_notification(filter, addfd.id); 1216 + if (!knotif) { 1217 + ret = -ENOENT; 1218 + goto out_unlock; 1300 1219 } 1301 1220 1302 - out: 1221 + /* 1222 + * We do not want to allow for FD injection to occur before the 1223 + * notification has been picked up by a userspace handler, or after 1224 + * the notification has been replied to. 1225 + */ 1226 + if (knotif->state != SECCOMP_NOTIFY_SENT) { 1227 + ret = -EINPROGRESS; 1228 + goto out_unlock; 1229 + } 1230 + 1231 + list_add(&kaddfd.list, &knotif->addfd); 1232 + complete(&knotif->ready); 1303 1233 mutex_unlock(&filter->notify_lock); 1234 + 1235 + /* Now we wait for it to be processed or be interrupted */ 1236 + ret = wait_for_completion_interruptible(&kaddfd.completion); 1237 + if (ret == 0) { 1238 + /* 1239 + * We had a successful completion. 
The other side has already 1240 + * removed us from the addfd queue, and 1241 + * wait_for_completion_interruptible has a memory barrier upon 1242 + * success that lets us read this value directly without 1243 + * locking. 1244 + */ 1245 + ret = kaddfd.ret; 1246 + goto out; 1247 + } 1248 + 1249 + mutex_lock(&filter->notify_lock); 1250 + /* 1251 + * Even though we were woken up by a signal and not a successful 1252 + * completion, a completion may have happened in the mean time. 1253 + * 1254 + * We need to check again if the addfd request has been handled, 1255 + * and if not, we will remove it from the queue. 1256 + */ 1257 + if (list_empty(&kaddfd.list)) 1258 + ret = kaddfd.ret; 1259 + else 1260 + list_del(&kaddfd.list); 1261 + 1262 + out_unlock: 1263 + mutex_unlock(&filter->notify_lock); 1264 + out: 1265 + fput(kaddfd.file); 1266 + 1304 1267 return ret; 1305 1268 } 1306 1269 ··· 1402 1181 struct seccomp_filter *filter = file->private_data; 1403 1182 void __user *buf = (void __user *)arg; 1404 1183 1184 + /* Fixed-size ioctls */ 1405 1185 switch (cmd) { 1406 1186 case SECCOMP_IOCTL_NOTIF_RECV: 1407 1187 return seccomp_notify_recv(filter, buf); 1408 1188 case SECCOMP_IOCTL_NOTIF_SEND: 1409 1189 return seccomp_notify_send(filter, buf); 1190 + case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: 1410 1191 case SECCOMP_IOCTL_NOTIF_ID_VALID: 1411 1192 return seccomp_notify_id_valid(filter, buf); 1193 + } 1194 + 1195 + /* Extensible Argument ioctls */ 1196 + #define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) 1197 + switch (EA_IOCTL(cmd)) { 1198 + case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): 1199 + return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); 1412 1200 default: 1413 1201 return -EINVAL; 1414 1202 } ··· 1430 1200 __poll_t ret = 0; 1431 1201 struct seccomp_knotif *cur; 1432 1202 1433 - poll_wait(file, &filter->notif->wqh, poll_tab); 1203 + poll_wait(file, &filter->wqh, poll_tab); 1434 1204 1435 1205 if (mutex_lock_interruptible(&filter->notify_lock) < 0) 1436 
1206 return EPOLLERR; ··· 1445 1215 } 1446 1216 1447 1217 mutex_unlock(&filter->notify_lock); 1218 + 1219 + if (refcount_read(&filter->users) == 0) 1220 + ret |= EPOLLHUP; 1448 1221 1449 1222 return ret; 1450 1223 } ··· 1477 1244 sema_init(&filter->notif->request, 0); 1478 1245 filter->notif->next_id = get_random_u64(); 1479 1246 INIT_LIST_HEAD(&filter->notif->notifications); 1480 - init_waitqueue_head(&filter->notif->wqh); 1481 1247 1482 1248 ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops, 1483 1249 filter, O_RDWR); ··· 2054 1822 2055 1823 hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); 2056 1824 if (!hdr) 2057 - pr_warn("seccomp: sysctl registration failed\n"); 1825 + pr_warn("sysctl registration failed\n"); 2058 1826 else 2059 1827 kmemleak_not_leak(hdr); 2060 1828
+25 -30
net/compat.c
··· 281 281 return 0; 282 282 } 283 283 284 - void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) 284 + static int scm_max_fds_compat(struct msghdr *msg) 285 285 { 286 - struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control; 287 - int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int); 288 - int fdnum = scm->fp->count; 289 - struct file **fp = scm->fp->fp; 290 - int __user *cmfptr; 286 + if (msg->msg_controllen <= sizeof(struct compat_cmsghdr)) 287 + return 0; 288 + return (msg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int); 289 + } 290 + 291 + void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm) 292 + { 293 + struct compat_cmsghdr __user *cm = 294 + (struct compat_cmsghdr __user *)msg->msg_control; 295 + unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; 296 + int fdmax = min_t(int, scm_max_fds_compat(msg), scm->fp->count); 297 + int __user *cmsg_data = CMSG_USER_DATA(cm); 291 298 int err = 0, i; 292 299 293 - if (fdnum < fdmax) 294 - fdmax = fdnum; 295 - 296 - for (i = 0, cmfptr = (int __user *) CMSG_COMPAT_DATA(cm); i < fdmax; i++, cmfptr++) { 297 - int new_fd; 298 - err = security_file_receive(fp[i]); 299 - if (err) 300 - break; 301 - err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & kmsg->msg_flags 302 - ? O_CLOEXEC : 0); 300 + for (i = 0; i < fdmax; i++) { 301 + err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); 303 302 if (err < 0) 304 303 break; 305 - new_fd = err; 306 - err = put_user(new_fd, cmfptr); 307 - if (err) { 308 - put_unused_fd(new_fd); 309 - break; 310 - } 311 - /* Bump the usage count and install the file. 
*/ 312 - fd_install(new_fd, get_file(fp[i])); 313 304 } 314 305 315 306 if (i > 0) { 316 307 int cmlen = CMSG_COMPAT_LEN(i * sizeof(int)); 308 + 317 309 err = put_user(SOL_SOCKET, &cm->cmsg_level); 318 310 if (!err) 319 311 err = put_user(SCM_RIGHTS, &cm->cmsg_type); ··· 313 321 err = put_user(cmlen, &cm->cmsg_len); 314 322 if (!err) { 315 323 cmlen = CMSG_COMPAT_SPACE(i * sizeof(int)); 316 - kmsg->msg_control += cmlen; 317 - kmsg->msg_controllen -= cmlen; 324 + if (msg->msg_controllen < cmlen) 325 + cmlen = msg->msg_controllen; 326 + msg->msg_control += cmlen; 327 + msg->msg_controllen -= cmlen; 318 328 } 319 329 } 320 - if (i < fdnum) 321 - kmsg->msg_flags |= MSG_CTRUNC; 330 + 331 + if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) 332 + msg->msg_flags |= MSG_CTRUNC; 322 333 323 334 /* 324 - * All of the files that fit in the message have had their 325 - * usage counts incremented, so we just free the list. 335 + * All of the files that fit in the message have had their usage counts 336 + * incremented, so we just free the list. 326 337 */ 327 338 __scm_destroy(scm); 328 339 }
+10 -40
net/core/scm.c
··· 280 280 } 281 281 EXPORT_SYMBOL(put_cmsg_scm_timestamping); 282 282 283 - static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) 284 - { 285 - struct socket *sock; 286 - int new_fd; 287 - int error; 288 - 289 - error = security_file_receive(file); 290 - if (error) 291 - return error; 292 - 293 - new_fd = get_unused_fd_flags(o_flags); 294 - if (new_fd < 0) 295 - return new_fd; 296 - 297 - error = put_user(new_fd, ufd); 298 - if (error) { 299 - put_unused_fd(new_fd); 300 - return error; 301 - } 302 - 303 - /* Bump the usage count and install the file. */ 304 - sock = sock_from_file(file, &error); 305 - if (sock) { 306 - sock_update_netprioidx(&sock->sk->sk_cgrp_data); 307 - sock_update_classid(&sock->sk->sk_cgrp_data); 308 - } 309 - fd_install(new_fd, get_file(file)); 310 - return 0; 311 - } 312 - 313 283 static int scm_max_fds(struct msghdr *msg) 314 284 { 315 285 if (msg->msg_controllen <= sizeof(struct cmsghdr)) ··· 289 319 290 320 void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) 291 321 { 292 - struct cmsghdr __user *cm 293 - = (__force struct cmsghdr __user*)msg->msg_control; 294 - int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; 322 + struct cmsghdr __user *cm = 323 + (__force struct cmsghdr __user *)msg->msg_control; 324 + unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? 
O_CLOEXEC : 0; 295 325 int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); 296 326 int __user *cmsg_data = CMSG_USER_DATA(cm); 297 327 int err = 0, i; 328 + 329 + /* no use for FD passing from kernel space callers */ 330 + if (WARN_ON_ONCE(!msg->msg_control_is_user)) 331 + return; 298 332 299 333 if (msg->msg_flags & MSG_CMSG_COMPAT) { 300 334 scm_detach_fds_compat(msg, scm); 301 335 return; 302 336 } 303 337 304 - /* no use for FD passing from kernel space callers */ 305 - if (WARN_ON_ONCE(!msg->msg_control_is_user)) 306 - return; 307 - 308 338 for (i = 0; i < fdmax; i++) { 309 - err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); 310 - if (err) 339 + err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); 340 + if (err < 0) 311 341 break; 312 342 } 313 343 314 - if (i > 0) { 344 + if (i > 0) { 315 345 int cmlen = CMSG_LEN(i * sizeof(int)); 316 346 317 347 err = put_user(SOL_SOCKET, &cm->cmsg_level);
+21
net/core/sock.c
··· 2842 2842 } 2843 2843 EXPORT_SYMBOL(sock_no_mmap); 2844 2844 2845 + /* 2846 + * When a file is received (via SCM_RIGHTS, etc), we must bump the 2847 + * various sock-based usage counts. 2848 + */ 2849 + void __receive_sock(struct file *file) 2850 + { 2851 + struct socket *sock; 2852 + int error; 2853 + 2854 + /* 2855 + * The resulting value of "error" is ignored here since we only 2856 + * need to take action when the file is a socket and testing 2857 + * "sock" for NULL is sufficient. 2858 + */ 2859 + sock = sock_from_file(file, &error); 2860 + if (sock) { 2861 + sock_update_netprioidx(&sock->sk->sk_cgrp_data); 2862 + sock_update_classid(&sock->sk->sk_cgrp_data); 2863 + } 2864 + } 2865 + 2845 2866 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2846 2867 { 2847 2868 ssize_t res;
+8 -7
tools/testing/selftests/kselftest_harness.h
··· 195 195 * 196 196 * .. code-block:: c 197 197 * 198 - * FIXTURE_DATA(datatype name) 198 + * FIXTURE_DATA(datatype_name) 199 199 * 200 + * Almost always, you want just FIXTURE() instead (see below). 200 201 * This call may be used when the type of the fixture data 201 202 * is needed. In general, this should not be needed unless 202 203 * the *self* is being passed to a helper directly. ··· 212 211 * 213 212 * .. code-block:: c 214 213 * 215 - * FIXTURE(datatype name) { 214 + * FIXTURE(fixture_name) { 216 215 * type property1; 217 216 * ... 218 217 * }; ··· 239 238 * 240 239 * .. code-block:: c 241 240 * 242 - * FIXTURE_SETUP(fixture name) { implementation } 241 + * FIXTURE_SETUP(fixture_name) { implementation } 243 242 * 244 243 * Populates the required "setup" function for a fixture. An instance of the 245 244 * datatype defined with FIXTURE_DATA() will be exposed as *self* for the ··· 265 264 * 266 265 * .. code-block:: c 267 266 * 268 - * FIXTURE_TEARDOWN(fixture name) { implementation } 267 + * FIXTURE_TEARDOWN(fixture_name) { implementation } 269 268 * 270 269 * Populates the required "teardown" function for a fixture. An instance of the 271 270 * datatype defined with FIXTURE_DATA() will be exposed as *self* for the ··· 286 285 * 287 286 * .. code-block:: c 288 287 * 289 - * FIXTURE_VARIANT(datatype name) { 288 + * FIXTURE_VARIANT(fixture_name) { 290 289 * type property1; 291 290 * ... 292 291 * }; ··· 306 305 * 307 306 * .. code-block:: c 308 307 * 309 - * FIXTURE_ADD(datatype name) { 310 - * .property1 = val1; 308 + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) { 309 + * .property1 = val1, 311 310 * ... 312 311 * }; 313 312 *
+1
tools/testing/selftests/seccomp/config
··· 1 1 CONFIG_SECCOMP=y 2 2 CONFIG_SECCOMP_FILTER=y 3 + CONFIG_USER_NS=y
+57 -21
tools/testing/selftests/seccomp/seccomp_benchmark.c
··· 18 18 19 19 unsigned long long timing(clockid_t clk_id, unsigned long long samples) 20 20 { 21 - pid_t pid, ret; 22 - unsigned long long i; 23 21 struct timespec start, finish; 22 + unsigned long long i; 23 + pid_t pid, ret; 24 24 25 25 pid = getpid(); 26 26 assert(clock_gettime(clk_id, &start) == 0); ··· 31 31 assert(clock_gettime(clk_id, &finish) == 0); 32 32 33 33 i = finish.tv_sec - start.tv_sec; 34 - i *= 1000000000; 34 + i *= 1000000000ULL; 35 35 i += finish.tv_nsec - start.tv_nsec; 36 36 37 - printf("%lu.%09lu - %lu.%09lu = %llu\n", 37 + printf("%lu.%09lu - %lu.%09lu = %llu (%.1fs)\n", 38 38 finish.tv_sec, finish.tv_nsec, 39 39 start.tv_sec, start.tv_nsec, 40 - i); 40 + i, (double)i / 1000000000.0); 41 41 42 42 return i; 43 43 } 44 44 45 45 unsigned long long calibrate(void) 46 46 { 47 - unsigned long long i; 47 + struct timespec start, finish; 48 + unsigned long long i, samples, step = 9973; 49 + pid_t pid, ret; 50 + int seconds = 15; 48 51 49 - printf("Calibrating reasonable sample size...\n"); 52 + printf("Calibrating sample size for %d seconds worth of syscalls ...\n", seconds); 50 53 51 - for (i = 5; ; i++) { 52 - unsigned long long samples = 1 << i; 54 + samples = 0; 55 + pid = getpid(); 56 + assert(clock_gettime(CLOCK_MONOTONIC, &start) == 0); 57 + do { 58 + for (i = 0; i < step; i++) { 59 + ret = syscall(__NR_getpid); 60 + assert(pid == ret); 61 + } 62 + assert(clock_gettime(CLOCK_MONOTONIC, &finish) == 0); 53 63 54 - /* Find something that takes more than 5 seconds to run. 
*/ 55 - if (timing(CLOCK_REALTIME, samples) / 1000000000ULL > 5) 56 - return samples; 57 - } 64 + samples += step; 65 + i = finish.tv_sec - start.tv_sec; 66 + i *= 1000000000ULL; 67 + i += finish.tv_nsec - start.tv_nsec; 68 + } while (i < 1000000000ULL); 69 + 70 + return samples * seconds; 58 71 } 59 72 60 73 int main(int argc, char *argv[]) ··· 81 68 }; 82 69 long ret; 83 70 unsigned long long samples; 84 - unsigned long long native, filtered; 71 + unsigned long long native, filter1, filter2; 72 + 73 + printf("Current BPF sysctl settings:\n"); 74 + system("sysctl net.core.bpf_jit_enable"); 75 + system("sysctl net.core.bpf_jit_harden"); 85 76 86 77 if (argc > 1) 87 78 samples = strtoull(argv[1], NULL, 0); 88 79 else 89 80 samples = calibrate(); 90 81 91 - printf("Benchmarking %llu samples...\n", samples); 82 + printf("Benchmarking %llu syscalls...\n", samples); 92 83 84 + /* Native call */ 93 85 native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; 94 86 printf("getpid native: %llu ns\n", native); 95 87 96 88 ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 97 89 assert(ret == 0); 98 90 91 + /* One filter */ 99 92 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); 100 93 assert(ret == 0); 101 94 102 - filtered = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; 103 - printf("getpid RET_ALLOW: %llu ns\n", filtered); 95 + filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; 96 + printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1); 104 97 105 - printf("Estimated seccomp overhead per syscall: %llu ns\n", 106 - filtered - native); 98 + if (filter1 == native) 99 + printf("No overhead measured!? 
Try running again with more samples.\n"); 107 100 108 - if (filtered == native) 109 - printf("Trying running again with more samples.\n"); 101 + /* Two filters */ 102 + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); 103 + assert(ret == 0); 104 + 105 + filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; 106 + printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2); 107 + 108 + /* Calculations */ 109 + printf("Estimated total seccomp overhead for 1 filter: %llu ns\n", 110 + filter1 - native); 111 + 112 + printf("Estimated total seccomp overhead for 2 filters: %llu ns\n", 113 + filter2 - native); 114 + 115 + printf("Estimated seccomp per-filter overhead: %llu ns\n", 116 + filter2 - filter1); 117 + 118 + printf("Estimated seccomp entry overhead: %llu ns\n", 119 + filter1 - native - (filter2 - filter1)); 110 120 111 121 return 0; 112 122 }
+505 -203
tools/testing/selftests/seccomp/seccomp_bpf.c
··· 45 45 #include <sys/socket.h> 46 46 #include <sys/ioctl.h> 47 47 #include <linux/kcmp.h> 48 + #include <sys/resource.h> 48 49 49 50 #include <unistd.h> 50 51 #include <sys/syscall.h> 51 52 #include <poll.h> 52 53 53 54 #include "../kselftest_harness.h" 55 + #include "../clone3/clone3_selftests.h" 56 + 57 + /* Attempt to de-conflict with the selftests tree. */ 58 + #ifndef SKIP 59 + #define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__) 60 + #endif 54 61 55 62 #ifndef PR_SET_PTRACER 56 63 # define PR_SET_PTRACER 0x59616d61 ··· 174 167 175 168 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER 176 169 #define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) 170 + #endif 177 171 172 + #ifndef SECCOMP_RET_USER_NOTIF 178 173 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U 179 174 180 175 #define SECCOMP_IOC_MAGIC '!' ··· 189 180 #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) 190 181 #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ 191 182 struct seccomp_notif_resp) 192 - #define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) 183 + #define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) 193 184 194 185 struct seccomp_notif { 195 186 __u64 id; ··· 211 202 __u16 seccomp_data; 212 203 }; 213 204 #endif 205 + 206 + #ifndef SECCOMP_IOCTL_NOTIF_ADDFD 207 + /* On success, the return value is the remote process's added fd number */ 208 + #define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \ 209 + struct seccomp_notif_addfd) 210 + 211 + /* valid flags for seccomp_notif_addfd */ 212 + #define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ 213 + 214 + struct seccomp_notif_addfd { 215 + __u64 id; 216 + __u32 flags; 217 + __u32 srcfd; 218 + __u32 newfd; 219 + __u32 newfd_flags; 220 + }; 221 + #endif 222 + 223 + struct seccomp_notif_addfd_small { 224 + __u64 id; 225 + char weird[4]; 226 + }; 227 + #define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL \ 228 + SECCOMP_IOW(3, struct seccomp_notif_addfd_small) 229 + 230 + struct seccomp_notif_addfd_big { 231 + union { 232 + struct 
seccomp_notif_addfd addfd; 233 + char buf[sizeof(struct seccomp_notif_addfd) + 8]; 234 + }; 235 + }; 236 + #define SECCOMP_IOCTL_NOTIF_ADDFD_BIG \ 237 + SECCOMP_IOWR(3, struct seccomp_notif_addfd_big) 214 238 215 239 #ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY 216 240 #define PTRACE_EVENTMSG_SYSCALL_ENTRY 1 ··· 277 235 #define SIBLING_EXIT_UNKILLED 0xbadbeef 278 236 #define SIBLING_EXIT_FAILURE 0xbadface 279 237 #define SIBLING_EXIT_NEWPRIVS 0xbadfeed 238 + 239 + static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2) 240 + { 241 + #ifdef __NR_kcmp 242 + errno = 0; 243 + return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2); 244 + #else 245 + errno = ENOSYS; 246 + return -1; 247 + #endif 248 + } 249 + 250 + /* Have TH_LOG report actual location filecmp() is used. */ 251 + #define filecmp(pid1, pid2, fd1, fd2) ({ \ 252 + int _ret; \ 253 + \ 254 + _ret = __filecmp(pid1, pid2, fd1, fd2); \ 255 + if (_ret != 0) { \ 256 + if (_ret < 0 && errno == ENOSYS) { \ 257 + TH_LOG("kcmp() syscall missing (test is less accurate)");\ 258 + _ret = 0; \ 259 + } \ 260 + } \ 261 + _ret; }) 262 + 263 + TEST(kcmp) 264 + { 265 + int ret; 266 + 267 + ret = __filecmp(getpid(), getpid(), 1, 1); 268 + EXPECT_EQ(ret, 0); 269 + if (ret != 0 && errno == ENOSYS) 270 + SKIP(return, "Kernel does not support kcmp() (missing CONFIG_CHECKPOINT_RESTORE?)"); 271 + } 280 272 281 273 TEST(mode_strict_support) 282 274 { ··· 1546 1470 1547 1471 return tracer_pid; 1548 1472 } 1473 + 1549 1474 void teardown_trace_fixture(struct __test_metadata *_metadata, 1550 1475 pid_t tracer) 1551 1476 { ··· 1827 1750 EXPECT_EQ(0, ret); 1828 1751 } 1829 1752 1830 - void tracer_syscall(struct __test_metadata *_metadata, pid_t tracee, 1753 + void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee, 1831 1754 int status, void *args) 1832 1755 { 1833 1756 int ret; ··· 1904 1827 pid_t tracer, mytid, mypid, parent; 1905 1828 }; 1906 1829 1830 + FIXTURE_VARIANT(TRACE_syscall) { 1831 + /* 1832 + * All of the 
SECCOMP_RET_TRACE behaviors can be tested with either 1833 + * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL. 1834 + * This indicates if we should use SECCOMP_RET_TRACE (false), or 1835 + * ptrace (true). 1836 + */ 1837 + bool use_ptrace; 1838 + }; 1839 + 1840 + FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) { 1841 + .use_ptrace = true, 1842 + }; 1843 + 1844 + FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) { 1845 + .use_ptrace = false, 1846 + }; 1847 + 1907 1848 FIXTURE_SETUP(TRACE_syscall) 1908 1849 { 1909 1850 struct sock_filter filter[] = { ··· 1937 1842 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005), 1938 1843 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), 1939 1844 }; 1940 - 1941 - memset(&self->prog, 0, sizeof(self->prog)); 1942 - self->prog.filter = malloc(sizeof(filter)); 1943 - ASSERT_NE(NULL, self->prog.filter); 1944 - memcpy(self->prog.filter, filter, sizeof(filter)); 1945 - self->prog.len = (unsigned short)ARRAY_SIZE(filter); 1845 + struct sock_fprog prog = { 1846 + .len = (unsigned short)ARRAY_SIZE(filter), 1847 + .filter = filter, 1848 + }; 1849 + long ret; 1946 1850 1947 1851 /* Prepare some testable syscall results. */ 1948 1852 self->mytid = syscall(__NR_gettid); ··· 1959 1865 ASSERT_NE(self->parent, self->mypid); 1960 1866 1961 1867 /* Launch tracer. */ 1962 - self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL, 1963 - false); 1868 + self->tracer = setup_trace_fixture(_metadata, 1869 + variant->use_ptrace ? 
tracer_ptrace 1870 + : tracer_seccomp, 1871 + NULL, variant->use_ptrace); 1872 + 1873 + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 1874 + ASSERT_EQ(0, ret); 1875 + 1876 + if (variant->use_ptrace) 1877 + return; 1878 + 1879 + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); 1880 + ASSERT_EQ(0, ret); 1964 1881 } 1965 1882 1966 1883 FIXTURE_TEARDOWN(TRACE_syscall) 1967 1884 { 1968 1885 teardown_trace_fixture(_metadata, self->tracer); 1969 - if (self->prog.filter) 1970 - free(self->prog.filter); 1971 1886 } 1972 1887 1973 - TEST_F(TRACE_syscall, ptrace_syscall_redirected) 1888 + TEST(negative_ENOSYS) 1974 1889 { 1975 - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ 1976 - teardown_trace_fixture(_metadata, self->tracer); 1977 - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, 1978 - true); 1979 - 1980 - /* Tracer will redirect getpid to getppid. */ 1981 - EXPECT_NE(self->mypid, syscall(__NR_getpid)); 1890 + /* 1891 + * There should be no difference between an "internal" skip 1892 + * and userspace asking for syscall "-1". 1893 + */ 1894 + errno = 0; 1895 + EXPECT_EQ(-1, syscall(-1)); 1896 + EXPECT_EQ(errno, ENOSYS); 1897 + /* And no difference for "still not valid but not -1". */ 1898 + errno = 0; 1899 + EXPECT_EQ(-1, syscall(-101)); 1900 + EXPECT_EQ(errno, ENOSYS); 1982 1901 } 1983 1902 1984 - TEST_F(TRACE_syscall, ptrace_syscall_errno) 1903 + TEST_F(TRACE_syscall, negative_ENOSYS) 1985 1904 { 1986 - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ 1987 - teardown_trace_fixture(_metadata, self->tracer); 1988 - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, 1989 - true); 1990 - 1991 - /* Tracer should skip the open syscall, resulting in ESRCH. */ 1992 - EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat)); 1993 - } 1994 - 1995 - TEST_F(TRACE_syscall, ptrace_syscall_faked) 1996 - { 1997 - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. 
*/ 1998 - teardown_trace_fixture(_metadata, self->tracer); 1999 - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, 2000 - true); 2001 - 2002 - /* Tracer should skip the gettid syscall, resulting fake pid. */ 2003 - EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid)); 1905 + negative_ENOSYS(_metadata); 2004 1906 } 2005 1907 2006 1908 TEST_F(TRACE_syscall, syscall_allowed) 2007 1909 { 2008 - long ret; 2009 - 2010 - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 2011 - ASSERT_EQ(0, ret); 2012 - 2013 - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); 2014 - ASSERT_EQ(0, ret); 2015 - 2016 1910 /* getppid works as expected (no changes). */ 2017 1911 EXPECT_EQ(self->parent, syscall(__NR_getppid)); 2018 1912 EXPECT_NE(self->mypid, syscall(__NR_getppid)); ··· 2008 1926 2009 1927 TEST_F(TRACE_syscall, syscall_redirected) 2010 1928 { 2011 - long ret; 2012 - 2013 - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 2014 - ASSERT_EQ(0, ret); 2015 - 2016 - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); 2017 - ASSERT_EQ(0, ret); 2018 - 2019 1929 /* getpid has been redirected to getppid as expected. */ 2020 1930 EXPECT_EQ(self->parent, syscall(__NR_getpid)); 2021 1931 EXPECT_NE(self->mypid, syscall(__NR_getpid)); ··· 2015 1941 2016 1942 TEST_F(TRACE_syscall, syscall_errno) 2017 1943 { 2018 - long ret; 2019 - 2020 - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 2021 - ASSERT_EQ(0, ret); 2022 - 2023 - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); 2024 - ASSERT_EQ(0, ret); 2025 - 2026 - /* openat has been skipped and an errno return. */ 1944 + /* Tracer should skip the open syscall, resulting in ESRCH. 
*/ 2027 1945 EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat)); 2028 1946 } 2029 1947 2030 1948 TEST_F(TRACE_syscall, syscall_faked) 2031 1949 { 2032 - long ret; 2033 - 2034 - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 2035 - ASSERT_EQ(0, ret); 2036 - 2037 - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); 2038 - ASSERT_EQ(0, ret); 2039 - 2040 - /* gettid has been skipped and an altered return value stored. */ 1950 + /* Tracer skips the gettid syscall and store altered return value. */ 2041 1951 EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid)); 2042 1952 } 2043 1953 2044 - TEST_F(TRACE_syscall, skip_after_RET_TRACE) 1954 + TEST_F(TRACE_syscall, skip_after) 2045 1955 { 2046 1956 struct sock_filter filter[] = { 2047 1957 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, ··· 2040 1982 }; 2041 1983 long ret; 2042 1984 2043 - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 2044 - ASSERT_EQ(0, ret); 2045 - 2046 - /* Install fixture filter. */ 2047 - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); 2048 - ASSERT_EQ(0, ret); 2049 - 2050 - /* Install "errno on getppid" filter. */ 1985 + /* Install additional "errno on getppid" filter. */ 2051 1986 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); 2052 1987 ASSERT_EQ(0, ret); 2053 1988 ··· 2050 1999 EXPECT_EQ(EPERM, errno); 2051 2000 } 2052 2001 2053 - TEST_F_SIGNAL(TRACE_syscall, kill_after_RET_TRACE, SIGSYS) 2002 + TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS) 2054 2003 { 2055 2004 struct sock_filter filter[] = { 2056 2005 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, ··· 2065 2014 }; 2066 2015 long ret; 2067 2016 2068 - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 2069 - ASSERT_EQ(0, ret); 2070 - 2071 - /* Install fixture filter. */ 2072 - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); 2073 - ASSERT_EQ(0, ret); 2074 - 2075 - /* Install "death on getppid" filter. 
*/ 2076 - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); 2077 - ASSERT_EQ(0, ret); 2078 - 2079 - /* Tracer will redirect getpid to getppid, and we should die. */ 2080 - EXPECT_NE(self->mypid, syscall(__NR_getpid)); 2081 - } 2082 - 2083 - TEST_F(TRACE_syscall, skip_after_ptrace) 2084 - { 2085 - struct sock_filter filter[] = { 2086 - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, 2087 - offsetof(struct seccomp_data, nr)), 2088 - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1), 2089 - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM), 2090 - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), 2091 - }; 2092 - struct sock_fprog prog = { 2093 - .len = (unsigned short)ARRAY_SIZE(filter), 2094 - .filter = filter, 2095 - }; 2096 - long ret; 2097 - 2098 - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ 2099 - teardown_trace_fixture(_metadata, self->tracer); 2100 - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, 2101 - true); 2102 - 2103 - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 2104 - ASSERT_EQ(0, ret); 2105 - 2106 - /* Install "errno on getppid" filter. */ 2107 - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); 2108 - ASSERT_EQ(0, ret); 2109 - 2110 - /* Tracer will redirect getpid to getppid, and we should see EPERM. */ 2111 - EXPECT_EQ(-1, syscall(__NR_getpid)); 2112 - EXPECT_EQ(EPERM, errno); 2113 - } 2114 - 2115 - TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) 2116 - { 2117 - struct sock_filter filter[] = { 2118 - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, 2119 - offsetof(struct seccomp_data, nr)), 2120 - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1), 2121 - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), 2122 - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), 2123 - }; 2124 - struct sock_fprog prog = { 2125 - .len = (unsigned short)ARRAY_SIZE(filter), 2126 - .filter = filter, 2127 - }; 2128 - long ret; 2129 - 2130 - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. 
*/ 2131 - teardown_trace_fixture(_metadata, self->tracer); 2132 - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, 2133 - true); 2134 - 2135 - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 2136 - ASSERT_EQ(0, ret); 2137 - 2138 - /* Install "death on getppid" filter. */ 2017 + /* Install additional "death on getppid" filter. */ 2139 2018 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); 2140 2019 ASSERT_EQ(0, ret); 2141 2020 ··· 3050 3069 3051 3070 /* Only real root can get metadata. */ 3052 3071 if (geteuid()) { 3053 - XFAIL(return, "get_metadata requires real root"); 3072 + SKIP(return, "get_metadata requires real root"); 3054 3073 return; 3055 3074 } 3056 3075 ··· 3093 3112 ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md); 3094 3113 EXPECT_EQ(sizeof(md), ret) { 3095 3114 if (errno == EINVAL) 3096 - XFAIL(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)"); 3115 + SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)"); 3097 3116 } 3098 3117 3099 3118 EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG); ··· 3109 3128 ASSERT_EQ(0, kill(pid, SIGKILL)); 3110 3129 } 3111 3130 3112 - static int user_trap_syscall(int nr, unsigned int flags) 3131 + static int user_notif_syscall(int nr, unsigned int flags) 3113 3132 { 3114 3133 struct sock_filter filter[] = { 3115 3134 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, ··· 3155 3174 3156 3175 /* Check that we get -ENOSYS with no listener attached */ 3157 3176 if (pid == 0) { 3158 - if (user_trap_syscall(__NR_getppid, 0) < 0) 3177 + if (user_notif_syscall(__NR_getppid, 0) < 0) 3159 3178 exit(1); 3160 3179 ret = syscall(__NR_getppid); 3161 3180 exit(ret >= 0 || errno != ENOSYS); ··· 3172 3191 EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); 3173 3192 3174 3193 /* Check that the basic notification machinery works */ 3175 - listener = user_trap_syscall(__NR_getppid, 3176 - 
SECCOMP_FILTER_FLAG_NEW_LISTENER); 3194 + listener = user_notif_syscall(__NR_getppid, 3195 + SECCOMP_FILTER_FLAG_NEW_LISTENER); 3177 3196 ASSERT_GE(listener, 0); 3178 3197 3179 3198 /* Installing a second listener in the chain should EBUSY */ 3180 - EXPECT_EQ(user_trap_syscall(__NR_getppid, 3181 - SECCOMP_FILTER_FLAG_NEW_LISTENER), 3199 + EXPECT_EQ(user_notif_syscall(__NR_getppid, 3200 + SECCOMP_FILTER_FLAG_NEW_LISTENER), 3182 3201 -1); 3183 3202 EXPECT_EQ(errno, EBUSY); 3184 3203 ··· 3239 3258 int ret; 3240 3259 unsigned int flags; 3241 3260 3261 + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 3262 + ASSERT_EQ(0, ret) { 3263 + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); 3264 + } 3265 + 3242 3266 /* these were exclusive */ 3243 3267 flags = SECCOMP_FILTER_FLAG_NEW_LISTENER | 3244 3268 SECCOMP_FILTER_FLAG_TSYNC; 3245 - ASSERT_EQ(-1, user_trap_syscall(__NR_getppid, flags)); 3269 + ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags)); 3246 3270 ASSERT_EQ(EINVAL, errno); 3247 3271 3248 3272 /* but now they're not */ 3249 3273 flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH; 3250 - ret = user_trap_syscall(__NR_getppid, flags); 3274 + ret = user_notif_syscall(__NR_getppid, flags); 3251 3275 close(ret); 3252 3276 ASSERT_LE(0, ret); 3253 3277 } ··· 3270 3284 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); 3271 3285 } 3272 3286 3273 - listener = user_trap_syscall(__NR_getppid, 3274 - SECCOMP_FILTER_FLAG_NEW_LISTENER); 3287 + listener = user_notif_syscall(__NR_getppid, 3288 + SECCOMP_FILTER_FLAG_NEW_LISTENER); 3275 3289 ASSERT_GE(listener, 0); 3276 3290 3277 3291 /* ··· 3324 3338 3325 3339 ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0); 3326 3340 3327 - listener = user_trap_syscall(__NR_gettid, 3328 - SECCOMP_FILTER_FLAG_NEW_LISTENER); 3341 + listener = user_notif_syscall(__NR_gettid, 3342 + SECCOMP_FILTER_FLAG_NEW_LISTENER); 3329 3343 ASSERT_GE(listener, 0); 3330 3344 3331 3345 pid = fork(); ··· 3394 3408 TH_LOG("Kernel does not support 
PR_SET_NO_NEW_PRIVS!"); 3395 3409 } 3396 3410 3397 - listener = user_trap_syscall(__NR_getppid, 3398 - SECCOMP_FILTER_FLAG_NEW_LISTENER); 3411 + listener = user_notif_syscall(__NR_getppid, 3412 + SECCOMP_FILTER_FLAG_NEW_LISTENER); 3399 3413 ASSERT_GE(listener, 0); 3400 3414 3401 3415 /* ··· 3426 3440 struct seccomp_notif req = {}; 3427 3441 struct seccomp_notif_resp resp = {}; 3428 3442 3429 - ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0); 3443 + ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) { 3444 + if (errno == EINVAL) 3445 + SKIP(return, "kernel missing CLONE_NEWUSER support"); 3446 + }; 3430 3447 3431 - listener = user_trap_syscall(__NR_getppid, 3432 - SECCOMP_FILTER_FLAG_NEW_LISTENER); 3448 + listener = user_notif_syscall(__NR_getppid, 3449 + SECCOMP_FILTER_FLAG_NEW_LISTENER); 3433 3450 ASSERT_GE(listener, 0); 3434 3451 3435 3452 pid = fork(); ··· 3471 3482 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); 3472 3483 } 3473 3484 3474 - listener = user_trap_syscall(__NR_getppid, 3475 - SECCOMP_FILTER_FLAG_NEW_LISTENER); 3485 + listener = user_notif_syscall(__NR_getppid, 3486 + SECCOMP_FILTER_FLAG_NEW_LISTENER); 3476 3487 ASSERT_GE(listener, 0); 3477 3488 3478 3489 pid = fork(); ··· 3494 3505 } 3495 3506 3496 3507 /* Create the sibling ns, and sibling in it. 
*/ 3497 - ASSERT_EQ(unshare(CLONE_NEWPID), 0); 3508 + ASSERT_EQ(unshare(CLONE_NEWPID), 0) { 3509 + if (errno == EPERM) 3510 + SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN"); 3511 + } 3498 3512 ASSERT_EQ(errno, 0); 3499 3513 3500 3514 pid2 = fork(); ··· 3539 3547 3540 3548 ASSERT_EQ(unshare(CLONE_NEWUSER), 0); 3541 3549 3542 - listener = user_trap_syscall(__NR_getppid, 3543 - SECCOMP_FILTER_FLAG_NEW_LISTENER); 3550 + listener = user_notif_syscall(__NR_getppid, 3551 + SECCOMP_FILTER_FLAG_NEW_LISTENER); 3544 3552 ASSERT_GE(listener, 0); 3545 3553 3546 3554 pid = fork(); ··· 3577 3585 EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp)); 3578 3586 } 3579 3587 3580 - static int filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2) 3581 - { 3582 - #ifdef __NR_kcmp 3583 - return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2); 3584 - #else 3585 - errno = ENOSYS; 3586 - return -1; 3587 - #endif 3588 - } 3589 - 3590 3588 TEST(user_notification_continue) 3591 3589 { 3592 3590 pid_t pid; ··· 3591 3609 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); 3592 3610 } 3593 3611 3594 - listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER); 3612 + listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER); 3595 3613 ASSERT_GE(listener, 0); 3596 3614 3597 3615 pid = fork(); ··· 3601 3619 int dup_fd, pipe_fds[2]; 3602 3620 pid_t self; 3603 3621 3604 - ret = pipe(pipe_fds); 3605 - if (ret < 0) 3606 - exit(1); 3622 + ASSERT_GE(pipe(pipe_fds), 0); 3607 3623 3608 3624 dup_fd = dup(pipe_fds[0]); 3609 - if (dup_fd < 0) 3610 - exit(1); 3625 + ASSERT_GE(dup_fd, 0); 3626 + EXPECT_NE(pipe_fds[0], dup_fd); 3611 3627 3612 3628 self = getpid(); 3613 - 3614 - ret = filecmp(self, self, pipe_fds[0], dup_fd); 3615 - if (ret) 3616 - exit(2); 3617 - 3629 + ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0); 3618 3630 exit(0); 3619 3631 } 3620 3632 ··· 3649 3673 resp.val = 0; 3650 3674 EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, 
&resp), 0) { 3651 3675 if (errno == EINVAL) 3652 - XFAIL(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE"); 3676 + SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE"); 3653 3677 } 3654 3678 3655 3679 skip: ··· 3657 3681 EXPECT_EQ(true, WIFEXITED(status)); 3658 3682 EXPECT_EQ(0, WEXITSTATUS(status)) { 3659 3683 if (WEXITSTATUS(status) == 2) { 3660 - XFAIL(return, "Kernel does not support kcmp() syscall"); 3684 + SKIP(return, "Kernel does not support kcmp() syscall"); 3661 3685 return; 3662 3686 } 3663 3687 } 3664 3688 } 3665 3689 3690 + TEST(user_notification_filter_empty) 3691 + { 3692 + pid_t pid; 3693 + long ret; 3694 + int status; 3695 + struct pollfd pollfd; 3696 + struct clone_args args = { 3697 + .flags = CLONE_FILES, 3698 + .exit_signal = SIGCHLD, 3699 + }; 3700 + 3701 + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 3702 + ASSERT_EQ(0, ret) { 3703 + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); 3704 + } 3705 + 3706 + pid = sys_clone3(&args, sizeof(args)); 3707 + ASSERT_GE(pid, 0); 3708 + 3709 + if (pid == 0) { 3710 + int listener; 3711 + 3712 + listener = user_notif_syscall(__NR_mknod, SECCOMP_FILTER_FLAG_NEW_LISTENER); 3713 + if (listener < 0) 3714 + _exit(EXIT_FAILURE); 3715 + 3716 + if (dup2(listener, 200) != 200) 3717 + _exit(EXIT_FAILURE); 3718 + 3719 + close(listener); 3720 + 3721 + _exit(EXIT_SUCCESS); 3722 + } 3723 + 3724 + EXPECT_EQ(waitpid(pid, &status, 0), pid); 3725 + EXPECT_EQ(true, WIFEXITED(status)); 3726 + EXPECT_EQ(0, WEXITSTATUS(status)); 3727 + 3728 + /* 3729 + * The seccomp filter has become unused so we should be notified once 3730 + * the kernel gets around to cleaning up task struct. 
3731 + */ 3732 + pollfd.fd = 200; 3733 + pollfd.events = POLLHUP; 3734 + 3735 + EXPECT_GT(poll(&pollfd, 1, 2000), 0); 3736 + EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0); 3737 + } 3738 + 3739 + static void *do_thread(void *data) 3740 + { 3741 + return NULL; 3742 + } 3743 + 3744 + TEST(user_notification_filter_empty_threaded) 3745 + { 3746 + pid_t pid; 3747 + long ret; 3748 + int status; 3749 + struct pollfd pollfd; 3750 + struct clone_args args = { 3751 + .flags = CLONE_FILES, 3752 + .exit_signal = SIGCHLD, 3753 + }; 3754 + 3755 + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 3756 + ASSERT_EQ(0, ret) { 3757 + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); 3758 + } 3759 + 3760 + pid = sys_clone3(&args, sizeof(args)); 3761 + ASSERT_GE(pid, 0); 3762 + 3763 + if (pid == 0) { 3764 + pid_t pid1, pid2; 3765 + int listener, status; 3766 + pthread_t thread; 3767 + 3768 + listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER); 3769 + if (listener < 0) 3770 + _exit(EXIT_FAILURE); 3771 + 3772 + if (dup2(listener, 200) != 200) 3773 + _exit(EXIT_FAILURE); 3774 + 3775 + close(listener); 3776 + 3777 + pid1 = fork(); 3778 + if (pid1 < 0) 3779 + _exit(EXIT_FAILURE); 3780 + 3781 + if (pid1 == 0) 3782 + _exit(EXIT_SUCCESS); 3783 + 3784 + pid2 = fork(); 3785 + if (pid2 < 0) 3786 + _exit(EXIT_FAILURE); 3787 + 3788 + if (pid2 == 0) 3789 + _exit(EXIT_SUCCESS); 3790 + 3791 + if (pthread_create(&thread, NULL, do_thread, NULL) || 3792 + pthread_join(thread, NULL)) 3793 + _exit(EXIT_FAILURE); 3794 + 3795 + if (pthread_create(&thread, NULL, do_thread, NULL) || 3796 + pthread_join(thread, NULL)) 3797 + _exit(EXIT_FAILURE); 3798 + 3799 + if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) || 3800 + WEXITSTATUS(status)) 3801 + _exit(EXIT_FAILURE); 3802 + 3803 + if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) || 3804 + WEXITSTATUS(status)) 3805 + _exit(EXIT_FAILURE); 3806 + 3807 + exit(EXIT_SUCCESS); 3808 + } 3809 + 3810 + EXPECT_EQ(waitpid(pid, 
&status, 0), pid); 3811 + EXPECT_EQ(true, WIFEXITED(status)); 3812 + EXPECT_EQ(0, WEXITSTATUS(status)); 3813 + 3814 + /* 3815 + * The seccomp filter has become unused so we should be notified once 3816 + * the kernel gets around to cleaning up task struct. 3817 + */ 3818 + pollfd.fd = 200; 3819 + pollfd.events = POLLHUP; 3820 + 3821 + EXPECT_GT(poll(&pollfd, 1, 2000), 0); 3822 + EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0); 3823 + } 3824 + 3825 + TEST(user_notification_addfd) 3826 + { 3827 + pid_t pid; 3828 + long ret; 3829 + int status, listener, memfd, fd; 3830 + struct seccomp_notif_addfd addfd = {}; 3831 + struct seccomp_notif_addfd_small small = {}; 3832 + struct seccomp_notif_addfd_big big = {}; 3833 + struct seccomp_notif req = {}; 3834 + struct seccomp_notif_resp resp = {}; 3835 + /* 100 ms */ 3836 + struct timespec delay = { .tv_nsec = 100000000 }; 3837 + 3838 + memfd = memfd_create("test", 0); 3839 + ASSERT_GE(memfd, 0); 3840 + 3841 + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 3842 + ASSERT_EQ(0, ret) { 3843 + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); 3844 + } 3845 + 3846 + /* Check that the basic notification machinery works */ 3847 + listener = user_notif_syscall(__NR_getppid, 3848 + SECCOMP_FILTER_FLAG_NEW_LISTENER); 3849 + ASSERT_GE(listener, 0); 3850 + 3851 + pid = fork(); 3852 + ASSERT_GE(pid, 0); 3853 + 3854 + if (pid == 0) { 3855 + if (syscall(__NR_getppid) != USER_NOTIF_MAGIC) 3856 + exit(1); 3857 + exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC); 3858 + } 3859 + 3860 + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); 3861 + 3862 + addfd.srcfd = memfd; 3863 + addfd.newfd = 0; 3864 + addfd.id = req.id; 3865 + addfd.flags = 0x0; 3866 + 3867 + /* Verify bad newfd_flags cannot be set */ 3868 + addfd.newfd_flags = ~O_CLOEXEC; 3869 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); 3870 + EXPECT_EQ(errno, EINVAL); 3871 + addfd.newfd_flags = O_CLOEXEC; 3872 + 3873 + /* Verify bad flags cannot be set */ 
3874 + addfd.flags = 0xff; 3875 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); 3876 + EXPECT_EQ(errno, EINVAL); 3877 + addfd.flags = 0; 3878 + 3879 + /* Verify that remote_fd cannot be set without setting flags */ 3880 + addfd.newfd = 1; 3881 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); 3882 + EXPECT_EQ(errno, EINVAL); 3883 + addfd.newfd = 0; 3884 + 3885 + /* Verify small size cannot be set */ 3886 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1); 3887 + EXPECT_EQ(errno, EINVAL); 3888 + 3889 + /* Verify we can't send bits filled in unknown buffer area */ 3890 + memset(&big, 0xAA, sizeof(big)); 3891 + big.addfd = addfd; 3892 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1); 3893 + EXPECT_EQ(errno, E2BIG); 3894 + 3895 + 3896 + /* Verify we can set an arbitrary remote fd */ 3897 + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); 3898 + /* 3899 + * The child has fds 0(stdin), 1(stdout), 2(stderr), 3(memfd), 3900 + * 4(listener), so the newly allocated fd should be 5. 3901 + */ 3902 + EXPECT_EQ(fd, 5); 3903 + EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); 3904 + 3905 + /* Verify we can set an arbitrary remote fd with large size */ 3906 + memset(&big, 0x0, sizeof(big)); 3907 + big.addfd = addfd; 3908 + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big); 3909 + EXPECT_EQ(fd, 6); 3910 + 3911 + /* Verify we can set a specific remote fd */ 3912 + addfd.newfd = 42; 3913 + addfd.flags = SECCOMP_ADDFD_FLAG_SETFD; 3914 + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); 3915 + EXPECT_EQ(fd, 42); 3916 + EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); 3917 + 3918 + /* Resume syscall */ 3919 + resp.id = req.id; 3920 + resp.error = 0; 3921 + resp.val = USER_NOTIF_MAGIC; 3922 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); 3923 + 3924 + /* 3925 + * This sets the ID of the ADD FD to the last request plus 1. 
The 3926 + * notification ID increments 1 per notification. 3927 + */ 3928 + addfd.id = req.id + 1; 3929 + 3930 + /* This spins until the underlying notification is generated */ 3931 + while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 && 3932 + errno != -EINPROGRESS) 3933 + nanosleep(&delay, NULL); 3934 + 3935 + memset(&req, 0, sizeof(req)); 3936 + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); 3937 + ASSERT_EQ(addfd.id, req.id); 3938 + 3939 + resp.id = req.id; 3940 + resp.error = 0; 3941 + resp.val = USER_NOTIF_MAGIC; 3942 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); 3943 + 3944 + /* Wait for child to finish. */ 3945 + EXPECT_EQ(waitpid(pid, &status, 0), pid); 3946 + EXPECT_EQ(true, WIFEXITED(status)); 3947 + EXPECT_EQ(0, WEXITSTATUS(status)); 3948 + 3949 + close(memfd); 3950 + } 3951 + 3952 + TEST(user_notification_addfd_rlimit) 3953 + { 3954 + pid_t pid; 3955 + long ret; 3956 + int status, listener, memfd; 3957 + struct seccomp_notif_addfd addfd = {}; 3958 + struct seccomp_notif req = {}; 3959 + struct seccomp_notif_resp resp = {}; 3960 + const struct rlimit lim = { 3961 + .rlim_cur = 0, 3962 + .rlim_max = 0, 3963 + }; 3964 + 3965 + memfd = memfd_create("test", 0); 3966 + ASSERT_GE(memfd, 0); 3967 + 3968 + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 3969 + ASSERT_EQ(0, ret) { 3970 + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); 3971 + } 3972 + 3973 + /* Check that the basic notification machinery works */ 3974 + listener = user_notif_syscall(__NR_getppid, 3975 + SECCOMP_FILTER_FLAG_NEW_LISTENER); 3976 + ASSERT_GE(listener, 0); 3977 + 3978 + pid = fork(); 3979 + ASSERT_GE(pid, 0); 3980 + 3981 + if (pid == 0) 3982 + exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC); 3983 + 3984 + 3985 + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); 3986 + 3987 + ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0); 3988 + 3989 + addfd.srcfd = memfd; 3990 + addfd.newfd_flags = O_CLOEXEC; 3991 + 
addfd.newfd = 0; 3992 + addfd.id = req.id; 3993 + addfd.flags = 0; 3994 + 3995 + /* Should probably spot check /proc/sys/fs/file-nr */ 3996 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); 3997 + EXPECT_EQ(errno, EMFILE); 3998 + 3999 + addfd.newfd = 100; 4000 + addfd.flags = SECCOMP_ADDFD_FLAG_SETFD; 4001 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); 4002 + EXPECT_EQ(errno, EBADF); 4003 + 4004 + resp.id = req.id; 4005 + resp.error = 0; 4006 + resp.val = USER_NOTIF_MAGIC; 4007 + 4008 + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); 4009 + 4010 + /* Wait for child to finish. */ 4011 + EXPECT_EQ(waitpid(pid, &status, 0), pid); 4012 + EXPECT_EQ(true, WIFEXITED(status)); 4013 + EXPECT_EQ(0, WEXITSTATUS(status)); 4014 + 4015 + close(memfd); 4016 + } 4017 + 3666 4018 /* 3667 4019 * TODO: 3668 - * - add microbenchmarks 3669 4020 * - expand NNP testing 3670 4021 * - better arch-specific TRACE and TRAP handlers. 3671 4022 * - endianness checking when appropriate ··· 4000 3697 * - arch value testing (x86 modes especially) 4001 3698 * - verify that FILTER_FLAG_LOG filters generate log messages 4002 3699 * - verify that RET_LOG generates log messages 4003 - * - ... 4004 3700 */ 4005 3701 4006 3702 TEST_HARNESS_MAIN
+1
tools/testing/selftests/seccomp/settings
··· 1 + timeout=90