Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux

Pull seccomp updates from Kees Cook:
"The major change here is finally gaining seccomp constant-action
bitmaps, which internally reduces the seccomp overhead for many
real-world syscall filters to O(1), as discussed at Plumbers this
year.

- Improve seccomp performance via constant-action bitmaps (YiFei Zhu
& Kees Cook)

- Fix bogus __user annotations (Jann Horn)

- Add missed CONFIG for improved selftest coverage (Mickaël Salaün)"

* tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
selftests/seccomp: Update kernel config
seccomp: Remove bogus __user annotations
seccomp/cache: Report cache data through /proc/pid/seccomp_cache
xtensa: Enable seccomp architecture tracking
sh: Enable seccomp architecture tracking
s390: Enable seccomp architecture tracking
riscv: Enable seccomp architecture tracking
powerpc: Enable seccomp architecture tracking
parisc: Enable seccomp architecture tracking
csky: Enable seccomp architecture tracking
arm: Enable seccomp architecture tracking
arm64: Enable seccomp architecture tracking
selftests/seccomp: Compare bitmap vs filter overhead
x86: Enable seccomp architecture tracking
seccomp/cache: Add "emulator" to check if filter is constant allow
seccomp/cache: Lookup syscall allowlist bitmap for fast path

+588 -28
+17
arch/Kconfig
··· 486 486 - secure_computing return value is checked and a return value of -1 487 487 results in the system call being skipped immediately. 488 488 - seccomp syscall wired up 489 + - if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE, 490 + SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If 491 + COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too. 489 492 490 493 config SECCOMP 491 494 prompt "Enable seccomp to safely execute untrusted bytecode" ··· 516 513 task-defined system call filtering policies. 517 514 518 515 See Documentation/userspace-api/seccomp_filter.rst for details. 516 + 517 + config SECCOMP_CACHE_DEBUG 518 + bool "Show seccomp filter cache status in /proc/pid/seccomp_cache" 519 + depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR 520 + depends on PROC_FS 521 + help 522 + This enables the /proc/pid/seccomp_cache interface to monitor 523 + seccomp cache data. The file format is subject to change. Reading 524 + the file requires CAP_SYS_ADMIN. 525 + 526 + This option is for debugging only. Enabling it presents the risk that 527 + an adversary may be able to infer the seccomp filter logic. 528 + 529 + If unsure, say N. 519 530 520 531 config HAVE_ARCH_STACKLEAK 521 532 bool
-1
arch/arm/include/asm/Kbuild
··· 4 4 generic-y += flat.h 5 5 generic-y += local64.h 6 6 generic-y += parport.h 7 - generic-y += seccomp.h 8 7 9 8 generated-y += mach-types.h 10 9 generated-y += unistd-nr.h
+11
arch/arm/include/asm/seccomp.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef _ASM_SECCOMP_H 3 + #define _ASM_SECCOMP_H 4 + 5 + #include <asm-generic/seccomp.h> 6 + 7 + #define SECCOMP_ARCH_NATIVE AUDIT_ARCH_ARM 8 + #define SECCOMP_ARCH_NATIVE_NR NR_syscalls 9 + #define SECCOMP_ARCH_NATIVE_NAME "arm" 10 + 11 + #endif /* _ASM_SECCOMP_H */
+9
arch/arm64/include/asm/seccomp.h
··· 19 19 20 20 #include <asm-generic/seccomp.h> 21 21 22 + #define SECCOMP_ARCH_NATIVE AUDIT_ARCH_AARCH64 23 + #define SECCOMP_ARCH_NATIVE_NR NR_syscalls 24 + #define SECCOMP_ARCH_NATIVE_NAME "aarch64" 25 + #ifdef CONFIG_COMPAT 26 + # define SECCOMP_ARCH_COMPAT AUDIT_ARCH_ARM 27 + # define SECCOMP_ARCH_COMPAT_NR __NR_compat_syscalls 28 + # define SECCOMP_ARCH_COMPAT_NAME "arm" 29 + #endif 30 + 22 31 #endif /* _ASM_SECCOMP_H */
-1
arch/csky/include/asm/Kbuild
··· 4 4 generic-y += kvm_para.h 5 5 generic-y += local64.h 6 6 generic-y += qrwlock.h 7 - generic-y += seccomp.h 8 7 generic-y += user.h 9 8 generic-y += vmlinux.lds.h
+11
arch/csky/include/asm/seccomp.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef _ASM_SECCOMP_H 3 + #define _ASM_SECCOMP_H 4 + 5 + #include <asm-generic/seccomp.h> 6 + 7 + #define SECCOMP_ARCH_NATIVE AUDIT_ARCH_CSKY 8 + #define SECCOMP_ARCH_NATIVE_NR NR_syscalls 9 + #define SECCOMP_ARCH_NATIVE_NAME "csky" 10 + 11 + #endif /* _ASM_SECCOMP_H */
-1
arch/parisc/include/asm/Kbuild
··· 5 5 generic-y += kvm_para.h 6 6 generic-y += local64.h 7 7 generic-y += mcs_spinlock.h 8 - generic-y += seccomp.h 9 8 generic-y += user.h
+22
arch/parisc/include/asm/seccomp.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef _ASM_SECCOMP_H 3 + #define _ASM_SECCOMP_H 4 + 5 + #include <asm-generic/seccomp.h> 6 + 7 + #ifdef CONFIG_64BIT 8 + # define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC64 9 + # define SECCOMP_ARCH_NATIVE_NR NR_syscalls 10 + # define SECCOMP_ARCH_NATIVE_NAME "parisc64" 11 + # ifdef CONFIG_COMPAT 12 + # define SECCOMP_ARCH_COMPAT AUDIT_ARCH_PARISC 13 + # define SECCOMP_ARCH_COMPAT_NR NR_syscalls 14 + # define SECCOMP_ARCH_COMPAT_NAME "parisc" 15 + # endif 16 + #else /* !CONFIG_64BIT */ 17 + # define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC 18 + # define SECCOMP_ARCH_NATIVE_NR NR_syscalls 19 + # define SECCOMP_ARCH_NATIVE_NAME "parisc" 20 + #endif 21 + 22 + #endif /* _ASM_SECCOMP_H */
+23
arch/powerpc/include/asm/seccomp.h
··· 8 8 9 9 #include <asm-generic/seccomp.h> 10 10 11 + #ifdef __LITTLE_ENDIAN__ 12 + #define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE 13 + #define __SECCOMP_ARCH_LE_NAME "le" 14 + #else 15 + #define __SECCOMP_ARCH_LE 0 16 + #define __SECCOMP_ARCH_LE_NAME 17 + #endif 18 + 19 + #ifdef CONFIG_PPC64 20 + # define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC64 | __SECCOMP_ARCH_LE) 21 + # define SECCOMP_ARCH_NATIVE_NR NR_syscalls 22 + # define SECCOMP_ARCH_NATIVE_NAME "ppc64" __SECCOMP_ARCH_LE_NAME 23 + # ifdef CONFIG_COMPAT 24 + # define SECCOMP_ARCH_COMPAT (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE) 25 + # define SECCOMP_ARCH_COMPAT_NR NR_syscalls 26 + # define SECCOMP_ARCH_COMPAT_NAME "ppc" __SECCOMP_ARCH_LE_NAME 27 + # endif 28 + #else /* !CONFIG_PPC64 */ 29 + # define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE) 30 + # define SECCOMP_ARCH_NATIVE_NR NR_syscalls 31 + # define SECCOMP_ARCH_NATIVE_NAME "ppc" __SECCOMP_ARCH_LE_NAME 32 + #endif 33 + 11 34 #endif /* _ASM_POWERPC_SECCOMP_H */
+10
arch/riscv/include/asm/seccomp.h
··· 7 7 8 8 #include <asm-generic/seccomp.h> 9 9 10 + #ifdef CONFIG_64BIT 11 + # define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV64 12 + # define SECCOMP_ARCH_NATIVE_NR NR_syscalls 13 + # define SECCOMP_ARCH_NATIVE_NAME "riscv64" 14 + #else /* !CONFIG_64BIT */ 15 + # define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV32 16 + # define SECCOMP_ARCH_NATIVE_NR NR_syscalls 17 + # define SECCOMP_ARCH_NATIVE_NAME "riscv32" 18 + #endif 19 + 10 20 #endif /* _ASM_SECCOMP_H */
+9
arch/s390/include/asm/seccomp.h
··· 16 16 17 17 #include <asm-generic/seccomp.h> 18 18 19 + #define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X 20 + #define SECCOMP_ARCH_NATIVE_NR NR_syscalls 21 + #define SECCOMP_ARCH_NATIVE_NAME "s390x" 22 + #ifdef CONFIG_COMPAT 23 + # define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390 24 + # define SECCOMP_ARCH_COMPAT_NR NR_syscalls 25 + # define SECCOMP_ARCH_COMPAT_NAME "s390" 26 + #endif 27 + 19 28 #endif /* _ASM_S390_SECCOMP_H */
+10
arch/sh/include/asm/seccomp.h
··· 8 8 #define __NR_seccomp_exit __NR_exit 9 9 #define __NR_seccomp_sigreturn __NR_rt_sigreturn 10 10 11 + #ifdef CONFIG_CPU_LITTLE_ENDIAN 12 + #define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE 13 + #else 14 + #define __SECCOMP_ARCH_LE 0 15 + #endif 16 + 17 + #define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_SH | __SECCOMP_ARCH_LE) 18 + #define SECCOMP_ARCH_NATIVE_NR NR_syscalls 19 + #define SECCOMP_ARCH_NATIVE_NAME "sh" 20 + 11 21 #endif /* __ASM_SECCOMP_H */
+20
arch/x86/include/asm/seccomp.h
··· 16 16 #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn 17 17 #endif 18 18 19 + #ifdef CONFIG_X86_64 20 + # define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64 21 + # define SECCOMP_ARCH_NATIVE_NR NR_syscalls 22 + # define SECCOMP_ARCH_NATIVE_NAME "x86_64" 23 + # ifdef CONFIG_COMPAT 24 + # define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386 25 + # define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls 26 + # define SECCOMP_ARCH_COMPAT_NAME "ia32" 27 + # endif 28 + /* 29 + * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support 30 + * caching them and they are treated as out of range syscalls, which will 31 + * always pass through the BPF filter. 32 + */ 33 + #else /* !CONFIG_X86_64 */ 34 + # define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386 35 + # define SECCOMP_ARCH_NATIVE_NR NR_syscalls 36 + # define SECCOMP_ARCH_NATIVE_NAME "ia32" 37 + #endif 38 + 19 39 #include <asm-generic/seccomp.h> 20 40 21 41 #endif /* _ASM_X86_SECCOMP_H */
-1
arch/xtensa/include/asm/Kbuild
··· 7 7 generic-y += param.h 8 8 generic-y += qrwlock.h 9 9 generic-y += qspinlock.h 10 - generic-y += seccomp.h 11 10 generic-y += user.h
+11
arch/xtensa/include/asm/seccomp.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef _ASM_SECCOMP_H 3 + #define _ASM_SECCOMP_H 4 + 5 + #include <asm-generic/seccomp.h> 6 + 7 + #define SECCOMP_ARCH_NATIVE AUDIT_ARCH_XTENSA 8 + #define SECCOMP_ARCH_NATIVE_NR NR_syscalls 9 + #define SECCOMP_ARCH_NATIVE_NAME "xtensa" 10 + 11 + #endif /* _ASM_SECCOMP_H */
+6
fs/proc/base.c
··· 3263 3263 #ifdef CONFIG_PROC_PID_ARCH_STATUS 3264 3264 ONE("arch_status", S_IRUGO, proc_pid_arch_status), 3265 3265 #endif 3266 + #ifdef CONFIG_SECCOMP_CACHE_DEBUG 3267 + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), 3268 + #endif 3266 3269 }; 3267 3270 3268 3271 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) ··· 3594 3591 #endif 3595 3592 #ifdef CONFIG_PROC_PID_ARCH_STATUS 3596 3593 ONE("arch_status", S_IRUGO, proc_pid_arch_status), 3594 + #endif 3595 + #ifdef CONFIG_SECCOMP_CACHE_DEBUG 3596 + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), 3597 3597 #endif 3598 3598 }; 3599 3599
+7
include/linux/seccomp.h
··· 121 121 return -EINVAL; 122 122 } 123 123 #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */ 124 + 125 + #ifdef CONFIG_SECCOMP_CACHE_DEBUG 126 + struct seq_file; 127 + 128 + int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, 129 + struct pid *pid, struct task_struct *task); 130 + #endif 124 131 #endif /* _LINUX_SECCOMP_H */
+293 -3
kernel/seccomp.c
··· 143 143 struct list_head notifications; 144 144 }; 145 145 146 + #ifdef SECCOMP_ARCH_NATIVE 147 + /** 148 + * struct action_cache - per-filter cache of seccomp actions per 149 + * arch/syscall pair 150 + * 151 + * @allow_native: A bitmap where each bit represents whether the 152 + * filter will always allow the syscall, for the 153 + * native architecture. 154 + * @allow_compat: A bitmap where each bit represents whether the 155 + * filter will always allow the syscall, for the 156 + * compat architecture. 157 + */ 158 + struct action_cache { 159 + DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR); 160 + #ifdef SECCOMP_ARCH_COMPAT 161 + DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR); 162 + #endif 163 + }; 164 + #else 165 + struct action_cache { }; 166 + 167 + static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, 168 + const struct seccomp_data *sd) 169 + { 170 + return false; 171 + } 172 + 173 + static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) 174 + { 175 + } 176 + #endif /* SECCOMP_ARCH_NATIVE */ 177 + 146 178 /** 147 179 * struct seccomp_filter - container for seccomp BPF programs 148 180 * ··· 191 159 * this filter after reaching 0. The @users count is always smaller 192 160 * or equal to @refs. Hence, reaching 0 for @users does not mean 193 161 * the filter can be freed. 
162 + * @cache: cache of arch/syscall mappings to actions 194 163 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged 195 164 * @prev: points to a previously installed, or inherited, filter 196 165 * @prog: the BPF program to evaluate ··· 213 180 refcount_t refs; 214 181 refcount_t users; 215 182 bool log; 183 + struct action_cache cache; 216 184 struct seccomp_filter *prev; 217 185 struct bpf_prog *prog; 218 186 struct notification *notif; ··· 332 298 return 0; 333 299 } 334 300 301 + #ifdef SECCOMP_ARCH_NATIVE 302 + static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap, 303 + size_t bitmap_size, 304 + int syscall_nr) 305 + { 306 + if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size)) 307 + return false; 308 + syscall_nr = array_index_nospec(syscall_nr, bitmap_size); 309 + 310 + return test_bit(syscall_nr, bitmap); 311 + } 312 + 313 + /** 314 + * seccomp_cache_check_allow - lookup seccomp cache 315 + * @sfilter: The seccomp filter 316 + * @sd: The seccomp data to lookup the cache with 317 + * 318 + * Returns true if the seccomp_data is cached and allowed. 319 + */ 320 + static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, 321 + const struct seccomp_data *sd) 322 + { 323 + int syscall_nr = sd->nr; 324 + const struct action_cache *cache = &sfilter->cache; 325 + 326 + #ifndef SECCOMP_ARCH_COMPAT 327 + /* A native-only architecture doesn't need to check sd->arch. 
*/ 328 + return seccomp_cache_check_allow_bitmap(cache->allow_native, 329 + SECCOMP_ARCH_NATIVE_NR, 330 + syscall_nr); 331 + #else 332 + if (likely(sd->arch == SECCOMP_ARCH_NATIVE)) 333 + return seccomp_cache_check_allow_bitmap(cache->allow_native, 334 + SECCOMP_ARCH_NATIVE_NR, 335 + syscall_nr); 336 + if (likely(sd->arch == SECCOMP_ARCH_COMPAT)) 337 + return seccomp_cache_check_allow_bitmap(cache->allow_compat, 338 + SECCOMP_ARCH_COMPAT_NR, 339 + syscall_nr); 340 + #endif /* SECCOMP_ARCH_COMPAT */ 341 + 342 + WARN_ON_ONCE(true); 343 + return false; 344 + } 345 + #endif /* SECCOMP_ARCH_NATIVE */ 346 + 335 347 /** 336 348 * seccomp_run_filters - evaluates all seccomp filters against @sd 337 349 * @sd: optional seccomp data to be passed to filters ··· 399 319 /* Ensure unexpected behavior doesn't result in failing open. */ 400 320 if (WARN_ON(f == NULL)) 401 321 return SECCOMP_RET_KILL_PROCESS; 322 + 323 + if (seccomp_cache_check_allow(f, sd)) 324 + return SECCOMP_RET_ALLOW; 402 325 403 326 /* 404 327 * All filters in the list are evaluated and the lowest BPF return ··· 553 470 { 554 471 struct seccomp_filter *orig = tsk->seccomp.filter; 555 472 473 + /* We are effectively holding the siglock by not having any sighand. */ 474 + WARN_ON(tsk->sighand != NULL); 475 + 556 476 /* Detach task from its filter tree. 
*/ 557 477 tsk->seccomp.filter = NULL; 558 478 __seccomp_filter_release(orig); ··· 630 544 { 631 545 struct seccomp_filter *sfilter; 632 546 int ret; 633 - const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); 547 + const bool save_orig = 548 + #if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE) 549 + true; 550 + #else 551 + false; 552 + #endif 634 553 635 554 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) 636 555 return ERR_PTR(-EINVAL); ··· 700 609 return filter; 701 610 } 702 611 612 + #ifdef SECCOMP_ARCH_NATIVE 613 + /** 614 + * seccomp_is_const_allow - check if filter is constant allow with given data 615 + * @fprog: The BPF programs 616 + * @sd: The seccomp data to check against, only syscall number and arch 617 + * number are considered constant. 618 + */ 619 + static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, 620 + struct seccomp_data *sd) 621 + { 622 + unsigned int reg_value = 0; 623 + unsigned int pc; 624 + bool op_res; 625 + 626 + if (WARN_ON_ONCE(!fprog)) 627 + return false; 628 + 629 + for (pc = 0; pc < fprog->len; pc++) { 630 + struct sock_filter *insn = &fprog->filter[pc]; 631 + u16 code = insn->code; 632 + u32 k = insn->k; 633 + 634 + switch (code) { 635 + case BPF_LD | BPF_W | BPF_ABS: 636 + switch (k) { 637 + case offsetof(struct seccomp_data, nr): 638 + reg_value = sd->nr; 639 + break; 640 + case offsetof(struct seccomp_data, arch): 641 + reg_value = sd->arch; 642 + break; 643 + default: 644 + /* can't optimize (non-constant value load) */ 645 + return false; 646 + } 647 + break; 648 + case BPF_RET | BPF_K: 649 + /* reached return with constant values only, check allow */ 650 + return k == SECCOMP_RET_ALLOW; 651 + case BPF_JMP | BPF_JA: 652 + pc += insn->k; 653 + break; 654 + case BPF_JMP | BPF_JEQ | BPF_K: 655 + case BPF_JMP | BPF_JGE | BPF_K: 656 + case BPF_JMP | BPF_JGT | BPF_K: 657 + case BPF_JMP | BPF_JSET | BPF_K: 658 + switch (BPF_OP(code)) { 659 + case BPF_JEQ: 660 + op_res = reg_value == k; 
661 + break; 662 + case BPF_JGE: 663 + op_res = reg_value >= k; 664 + break; 665 + case BPF_JGT: 666 + op_res = reg_value > k; 667 + break; 668 + case BPF_JSET: 669 + op_res = !!(reg_value & k); 670 + break; 671 + default: 672 + /* can't optimize (unknown jump) */ 673 + return false; 674 + } 675 + 676 + pc += op_res ? insn->jt : insn->jf; 677 + break; 678 + case BPF_ALU | BPF_AND | BPF_K: 679 + reg_value &= k; 680 + break; 681 + default: 682 + /* can't optimize (unknown insn) */ 683 + return false; 684 + } 685 + } 686 + 687 + /* ran off the end of the filter?! */ 688 + WARN_ON(1); 689 + return false; 690 + } 691 + 692 + static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, 693 + void *bitmap, const void *bitmap_prev, 694 + size_t bitmap_size, int arch) 695 + { 696 + struct sock_fprog_kern *fprog = sfilter->prog->orig_prog; 697 + struct seccomp_data sd; 698 + int nr; 699 + 700 + if (bitmap_prev) { 701 + /* The new filter must be as restrictive as the last. */ 702 + bitmap_copy(bitmap, bitmap_prev, bitmap_size); 703 + } else { 704 + /* Before any filters, all syscalls are always allowed. */ 705 + bitmap_fill(bitmap, bitmap_size); 706 + } 707 + 708 + for (nr = 0; nr < bitmap_size; nr++) { 709 + /* No bitmap change: not a cacheable action. */ 710 + if (!test_bit(nr, bitmap)) 711 + continue; 712 + 713 + sd.nr = nr; 714 + sd.arch = arch; 715 + 716 + /* No bitmap change: continue to always allow. */ 717 + if (seccomp_is_const_allow(fprog, &sd)) 718 + continue; 719 + 720 + /* 721 + * Not a cacheable action: always run filters. 722 + * atomic clear_bit() not needed, filter not visible yet. 723 + */ 724 + __clear_bit(nr, bitmap); 725 + } 726 + } 727 + 728 + /** 729 + * seccomp_cache_prepare - emulate the filter to find cacheable syscalls 730 + * @sfilter: The seccomp filter 731 + * 732 + * Fills in the filter's action cache bitmaps; there is no return value.
733 + */ 734 + static void seccomp_cache_prepare(struct seccomp_filter *sfilter) 735 + { 736 + struct action_cache *cache = &sfilter->cache; 737 + const struct action_cache *cache_prev = 738 + sfilter->prev ? &sfilter->prev->cache : NULL; 739 + 740 + seccomp_cache_prepare_bitmap(sfilter, cache->allow_native, 741 + cache_prev ? cache_prev->allow_native : NULL, 742 + SECCOMP_ARCH_NATIVE_NR, 743 + SECCOMP_ARCH_NATIVE); 744 + 745 + #ifdef SECCOMP_ARCH_COMPAT 746 + seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat, 747 + cache_prev ? cache_prev->allow_compat : NULL, 748 + SECCOMP_ARCH_COMPAT_NR, 749 + SECCOMP_ARCH_COMPAT); 750 + #endif /* SECCOMP_ARCH_COMPAT */ 751 + } 752 + #endif /* SECCOMP_ARCH_NATIVE */ 753 + 703 754 /** 704 755 * seccomp_attach_filter: validate and attach filter 705 756 * @flags: flags to change filter behavior ··· 891 658 * task reference. 892 659 */ 893 660 filter->prev = current->seccomp.filter; 661 + seccomp_cache_prepare(filter); 894 662 current->seccomp.filter = filter; 895 663 atomic_inc(&current->seccomp.filter_count); 896 664 ··· 2201 1967 return true; 2202 1968 } 2203 1969 2204 - static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer, 1970 + static int read_actions_logged(struct ctl_table *ro_table, void *buffer, 2205 1971 size_t *lenp, loff_t *ppos) 2206 1972 { 2207 1973 char names[sizeof(seccomp_actions_avail)]; ··· 2219 1985 return proc_dostring(&table, 0, buffer, lenp, ppos); 2220 1986 } 2221 1987 2222 - static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer, 1988 + static int write_actions_logged(struct ctl_table *ro_table, void *buffer, 2223 1989 size_t *lenp, loff_t *ppos, u32 *actions_logged) 2224 1990 { 2225 1991 char names[sizeof(seccomp_actions_avail)]; ··· 2337 2103 device_initcall(seccomp_sysctl_init) 2338 2104 2339 2105 #endif /* CONFIG_SYSCTL */ 2106 + 2107 + #ifdef CONFIG_SECCOMP_CACHE_DEBUG 2108 + /* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE 
*/ 2109 + static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name, 2110 + const void *bitmap, size_t bitmap_size) 2111 + { 2112 + int nr; 2113 + 2114 + for (nr = 0; nr < bitmap_size; nr++) { 2115 + bool cached = test_bit(nr, bitmap); 2116 + char *status = cached ? "ALLOW" : "FILTER"; 2117 + 2118 + seq_printf(m, "%s %d %s\n", name, nr, status); 2119 + } 2120 + } 2121 + 2122 + int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, 2123 + struct pid *pid, struct task_struct *task) 2124 + { 2125 + struct seccomp_filter *f; 2126 + unsigned long flags; 2127 + 2128 + /* 2129 + * We don't want some sandboxed process to know what their seccomp 2130 + * filters consist of. 2131 + */ 2132 + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) 2133 + return -EACCES; 2134 + 2135 + if (!lock_task_sighand(task, &flags)) 2136 + return -ESRCH; 2137 + 2138 + f = READ_ONCE(task->seccomp.filter); 2139 + if (!f) { 2140 + unlock_task_sighand(task, &flags); 2141 + return 0; 2142 + } 2143 + 2144 + /* prevent filter from being freed while we are printing it */ 2145 + __get_seccomp_filter(f); 2146 + unlock_task_sighand(task, &flags); 2147 + 2148 + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME, 2149 + f->cache.allow_native, 2150 + SECCOMP_ARCH_NATIVE_NR); 2151 + 2152 + #ifdef SECCOMP_ARCH_COMPAT 2153 + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME, 2154 + f->cache.allow_compat, 2155 + SECCOMP_ARCH_COMPAT_NR); 2156 + #endif /* SECCOMP_ARCH_COMPAT */ 2157 + 2158 + __put_seccomp_filter(f); 2159 + return 0; 2160 + } 2161 + #endif /* CONFIG_SECCOMP_CACHE_DEBUG */
+1
tools/testing/selftests/seccomp/config
··· 1 + CONFIG_PID_NS=y 1 2 CONFIG_SECCOMP=y 2 3 CONFIG_SECCOMP_FILTER=y 3 4 CONFIG_USER_NS=y
+127 -20
tools/testing/selftests/seccomp/seccomp_benchmark.c
··· 4 4 */ 5 5 #define _GNU_SOURCE 6 6 #include <assert.h> 7 + #include <limits.h> 8 + #include <stdbool.h> 9 + #include <stddef.h> 7 10 #include <stdio.h> 8 11 #include <stdlib.h> 9 12 #include <time.h> 10 13 #include <unistd.h> 11 14 #include <linux/filter.h> 12 15 #include <linux/seccomp.h> 16 + #include <sys/param.h> 13 17 #include <sys/prctl.h> 14 18 #include <sys/syscall.h> 15 19 #include <sys/types.h> ··· 74 70 return samples * seconds; 75 71 } 76 72 73 + bool approx(int i_one, int i_two) 74 + { 75 + double one = i_one, one_bump = one * 0.01; 76 + double two = i_two, two_bump = two * 0.01; 77 + 78 + one_bump = one + MAX(one_bump, 2.0); 79 + two_bump = two + MAX(two_bump, 2.0); 80 + 81 + /* Equal to, or within 1% or 2 digits */ 82 + if (one == two || 83 + (one > two && one <= two_bump) || 84 + (two > one && two <= one_bump)) 85 + return true; 86 + return false; 87 + } 88 + 89 + bool le(int i_one, int i_two) 90 + { 91 + if (i_one <= i_two) 92 + return true; 93 + return false; 94 + } 95 + 96 + long compare(const char *name_one, const char *name_eval, const char *name_two, 97 + unsigned long long one, bool (*eval)(int, int), unsigned long long two) 98 + { 99 + bool good; 100 + 101 + printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two, 102 + (long long)one, name_eval, (long long)two); 103 + if (one > INT_MAX) { 104 + printf("Miscalculation! Measurement went negative: %lld\n", (long long)one); 105 + return 1; 106 + } 107 + if (two > INT_MAX) { 108 + printf("Miscalculation! Measurement went negative: %lld\n", (long long)two); 109 + return 1; 110 + } 111 + 112 + good = eval(one, two); 113 + printf("%s\n", good ? "✔️" : "❌"); 114 + 115 + return good ? 
0 : 1; 116 + } 117 + 77 118 int main(int argc, char *argv[]) 78 119 { 120 + struct sock_filter bitmap_filter[] = { 121 + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), 122 + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), 123 + }; 124 + struct sock_fprog bitmap_prog = { 125 + .len = (unsigned short)ARRAY_SIZE(bitmap_filter), 126 + .filter = bitmap_filter, 127 + }; 79 128 struct sock_filter filter[] = { 129 + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])), 80 130 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), 81 131 }; 82 132 struct sock_fprog prog = { 83 133 .len = (unsigned short)ARRAY_SIZE(filter), 84 134 .filter = filter, 85 135 }; 86 - long ret; 87 - unsigned long long samples; 88 - unsigned long long native, filter1, filter2; 136 + 137 + long ret, bits; 138 + unsigned long long samples, calc; 139 + unsigned long long native, filter1, filter2, bitmap1, bitmap2; 140 + unsigned long long entry, per_filter1, per_filter2; 89 141 90 142 printf("Current BPF sysctl settings:\n"); 91 143 system("sysctl net.core.bpf_jit_enable"); ··· 161 101 ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 162 102 assert(ret == 0); 163 103 164 - /* One filter */ 104 + /* One filter resulting in a bitmap */ 105 + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); 106 + assert(ret == 0); 107 + 108 + bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; 109 + printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1); 110 + 111 + /* Second filter resulting in a bitmap */ 112 + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); 113 + assert(ret == 0); 114 + 115 + bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; 116 + printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2); 117 + 118 + /* Third filter, can no longer be converted to bitmap */ 165 119 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); 166 120 assert(ret == 0); 167 121 168 122 filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, 
samples) / samples; 169 - printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1); 123 + printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1); 170 124 171 - if (filter1 == native) 172 - printf("No overhead measured!? Try running again with more samples.\n"); 173 - 174 - /* Two filters */ 175 - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); 125 + /* Fourth filter, can not be converted to bitmap because of filter 3 */ 126 + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); 176 127 assert(ret == 0); 177 128 178 129 filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; 179 - printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2); 130 + printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2); 180 131 181 - /* Calculations */ 182 - printf("Estimated total seccomp overhead for 1 filter: %llu ns\n", 183 - filter1 - native); 132 + /* Estimations */ 133 + #define ESTIMATE(fmt, var, what) do { \ 134 + var = (what); \ 135 + printf("Estimated " fmt ": %llu ns\n", var); \ 136 + if (var > INT_MAX) \ 137 + goto more_samples; \ 138 + } while (0) 184 139 185 - printf("Estimated total seccomp overhead for 2 filters: %llu ns\n", 186 - filter2 - native); 140 + ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc, 141 + bitmap1 - native); 142 + ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc, 143 + bitmap2 - native); 144 + ESTIMATE("total seccomp overhead for 3 full filters", calc, 145 + filter1 - native); 146 + ESTIMATE("total seccomp overhead for 4 full filters", calc, 147 + filter2 - native); 148 + ESTIMATE("seccomp entry overhead", entry, 149 + bitmap1 - native - (bitmap2 - bitmap1)); 150 + ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1, 151 + filter2 - filter1); 152 + ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2, 153 + (filter2 - native - entry) / 4); 187 154 188 - printf("Estimated seccomp per-filter overhead: %llu ns\n", 189 - filter2 - filter1); 155 + 
printf("Expectations:\n"); 156 + ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1); 157 + bits = compare("native", "≤", "1 filter", native, le, filter1); 158 + if (bits) 159 + goto more_samples; 190 160 191 - printf("Estimated seccomp entry overhead: %llu ns\n", 192 - filter1 - native - (filter2 - filter1)); 161 + ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)", 162 + per_filter1, approx, per_filter2); 193 163 164 + bits = compare("1 bitmapped", "≈", "2 bitmapped", 165 + bitmap1 - native, approx, bitmap2 - native); 166 + if (bits) { 167 + printf("Skipping constant action bitmap expectations: they appear unsupported.\n"); 168 + goto out; 169 + } 170 + 171 + ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native); 172 + ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native); 173 + ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total", 174 + entry + (per_filter1 * 4) + native, approx, filter2); 175 + if (ret == 0) 176 + goto out; 177 + 178 + more_samples: 179 + printf("Saw unexpected benchmark result. Try running again with more samples?\n"); 180 + out: 194 181 return 0; 195 182 }
+1 -1
tools/testing/selftests/seccomp/settings
··· 1 - timeout=90 1 + timeout=120