Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/syscalls: Split the x32 syscalls into their own table

For unfortunate historical reasons, the x32 syscalls and the x86_64
syscalls are not all numbered the same. As an example, ioctl() is nr 16 on
x86_64 but 514 on x32.

This has potentially nasty consequences, since it means that there are two
valid RAX values that invoke ioctl(2) and two invalid RAX values that
nevertheless reach it. The valid values are 16 (i.e. ioctl(2) using the
x86_64 ABI) and (514 | 0x40000000) (i.e. ioctl(2) using the x32 ABI).

The invalid values are 514 and (16 | 0x40000000). 514 will enter the
"COMPAT_SYSCALL_DEFINE3(ioctl, ...)" entry point with in_compat_syscall()
and in_x32_syscall() returning false, whereas (16 | 0x40000000) will enter
the native entry point with in_compat_syscall() and in_x32_syscall()
returning true. Both are bogus, and both will exercise code paths in the
kernel and in any running seccomp filters that really ought to be
unreachable.

Splitting the x32 syscalls out into their own table allows both bogus
invocations to return -ENOSYS. I've checked glibc, musl, and Bionic, and
all of them appear to call syscalls with their correct numbers, so this
change should have no effect on them.

There is an added benefit going forward: new syscalls that need special
handling on x32 can share the same number on x32 and x86_64. This means
that the special syscall range 512-547 can be treated as a legacy wart
instead of something that may need to be extended in the future.

Also add a selftest to verify the new behavior.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/208024256b764312598f014ebfb0a42472c19354.1562185330.git.luto@kernel.org

Authored by Andy Lutomirski and committed by Thomas Gleixner
6365b842 f85a8573

+163 -27
+7 -6
arch/x86/entry/common.c
··· 285 285 if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) 286 286 nr = syscall_trace_enter(regs); 287 287 288 - /* 289 - * NB: Native and x32 syscalls are dispatched from the same 290 - * table. The only functional difference is the x32 bit in 291 - * regs->orig_ax, which changes the behavior of some syscalls. 292 - */ 293 - nr &= __SYSCALL_MASK; 294 288 if (likely(nr < NR_syscalls)) { 295 289 nr = array_index_nospec(nr, NR_syscalls); 296 290 regs->ax = sys_call_table[nr](regs); 291 + #ifdef CONFIG_X86_X32_ABI 292 + } else if (likely((nr & __X32_SYSCALL_BIT) && 293 + (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { 294 + nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, 295 + X32_NR_syscalls); 296 + regs->ax = x32_sys_call_table[nr](regs); 297 + #endif 297 298 } 298 299 299 300 syscall_return_slowpath(regs);
+25
arch/x86/entry/syscall_64.c
··· 10 10 /* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ 11 11 extern asmlinkage long sys_ni_syscall(const struct pt_regs *); 12 12 #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); 13 + #define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual) 13 14 #include <asm/syscalls_64.h> 14 15 #undef __SYSCALL_64 16 + #undef __SYSCALL_X32 15 17 16 18 #define __SYSCALL_64(nr, sym, qual) [nr] = sym, 19 + #define __SYSCALL_X32(nr, sym, qual) 17 20 18 21 asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 19 22 /* ··· 26 23 [0 ... __NR_syscall_max] = &sys_ni_syscall, 27 24 #include <asm/syscalls_64.h> 28 25 }; 26 + 27 + #undef __SYSCALL_64 28 + #undef __SYSCALL_X32 29 + 30 + #ifdef CONFIG_X86_X32_ABI 31 + 32 + #define __SYSCALL_64(nr, sym, qual) 33 + #define __SYSCALL_X32(nr, sym, qual) [nr] = sym, 34 + 35 + asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = { 36 + /* 37 + * Smells like a compiler bug -- it doesn't work 38 + * when the & below is removed. 39 + */ 40 + [0 ... __NR_syscall_x32_max] = &sys_ni_syscall, 41 + #include <asm/syscalls_64.h> 42 + }; 43 + 44 + #undef __SYSCALL_64 45 + #undef __SYSCALL_X32 46 + 47 + #endif
+17 -14
arch/x86/entry/syscalls/syscalltbl.sh
··· 1 - #!/bin/sh 1 + #!/bin/bash 2 2 # SPDX-License-Identifier: GPL-2.0 3 3 4 4 in="$1" 5 5 out="$2" 6 6 7 7 syscall_macro() { 8 - abi="$1" 9 - nr="$2" 10 - entry="$3" 8 + local abi="$1" 9 + local nr="$2" 10 + local entry="$3" 11 11 12 12 # Entry can be either just a function name or "function/qualifier" 13 13 real_entry="${entry%%/*}" ··· 21 21 } 22 22 23 23 emit() { 24 - abi="$1" 25 - nr="$2" 26 - entry="$3" 27 - compat="$4" 28 - umlentry="" 24 + local abi="$1" 25 + local nr="$2" 26 + local entry="$3" 27 + local compat="$4" 28 + local umlentry="" 29 29 30 30 if [ "$abi" != "I386" -a -n "$compat" ]; then 31 31 echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2 ··· 62 62 while read nr abi name entry compat; do 63 63 abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` 64 64 if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then 65 - # COMMON is the same as 64, except that we don't expect X32 66 - # programs to use it. Our expectation has nothing to do with 67 - # any generated code, so treat them the same. 68 65 emit 64 "$nr" "$entry" "$compat" 66 + if [ "$abi" = "COMMON" ]; then 67 + # COMMON means that this syscall exists in the same form for 68 + # 64-bit and X32. 69 + echo "#ifdef CONFIG_X86_X32_ABI" 70 + emit X32 "$nr" "$entry" "$compat" 71 + echo "#endif" 72 + fi 69 73 elif [ "$abi" = "X32" ]; then 70 - # X32 is equivalent to 64 on an X32-compatible kernel. 71 74 echo "#ifdef CONFIG_X86_X32_ABI" 72 - emit 64 "$nr" "$entry" "$compat" 75 + emit X32 "$nr" "$entry" "$compat" 73 76 echo "#endif" 74 77 elif [ "$abi" = "I386" ]; then 75 78 emit "$abi" "$nr" "$entry" "$compat"
+4
arch/x86/include/asm/syscall.h
··· 36 36 extern const sys_call_ptr_t ia32_sys_call_table[]; 37 37 #endif 38 38 39 + #ifdef CONFIG_X86_X32_ABI 40 + extern const sys_call_ptr_t x32_sys_call_table[]; 41 + #endif 42 + 39 43 /* 40 44 * Only the low 32 bits of orig_ax are meaningful, so we return int. 41 45 * This importantly ignores the high bits on 64-bit, so comparisons
-6
arch/x86/include/asm/unistd.h
··· 5 5 #include <uapi/asm/unistd.h> 6 6 7 7 8 - # ifdef CONFIG_X86_X32_ABI 9 - # define __SYSCALL_MASK (~(__X32_SYSCALL_BIT)) 10 - # else 11 - # define __SYSCALL_MASK (~0) 12 - # endif 13 - 14 8 # ifdef CONFIG_X86_32 15 9 16 10 # include <asm/unistd_32.h>
+20
arch/x86/kernel/asm-offsets_64.c
··· 6 6 #include <asm/ia32.h> 7 7 8 8 #define __SYSCALL_64(nr, sym, qual) [nr] = 1, 9 + #define __SYSCALL_X32(nr, sym, qual) 9 10 static char syscalls_64[] = { 10 11 #include <asm/syscalls_64.h> 11 12 }; 13 + #undef __SYSCALL_64 14 + #undef __SYSCALL_X32 15 + 16 + #ifdef CONFIG_X86_X32_ABI 17 + #define __SYSCALL_64(nr, sym, qual) 18 + #define __SYSCALL_X32(nr, sym, qual) [nr] = 1, 19 + static char syscalls_x32[] = { 20 + #include <asm/syscalls_64.h> 21 + }; 22 + #undef __SYSCALL_64 23 + #undef __SYSCALL_X32 24 + #endif 25 + 12 26 #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, 13 27 static char syscalls_ia32[] = { 14 28 #include <asm/syscalls_32.h> 15 29 }; 30 + #undef __SYSCALL_I386 16 31 17 32 #if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) 18 33 #include <asm/kvm_para.h> ··· 94 79 95 80 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); 96 81 DEFINE(NR_syscalls, sizeof(syscalls_64)); 82 + 83 + #ifdef CONFIG_X86_X32_ABI 84 + DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1); 85 + DEFINE(X32_NR_syscalls, sizeof(syscalls_x32)); 86 + #endif 97 87 98 88 DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); 99 89 DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
+1 -1
tools/testing/selftests/x86/Makefile
··· 17 17 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ 18 18 test_FCMOV test_FCOMI test_FISTTP \ 19 19 vdso_restorer 20 - TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip 20 + TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering 21 21 # Some selftests require 32bit support enabled also on 64bit systems 22 22 TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall 23 23
+89
tools/testing/selftests/x86/syscall_numbering.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * syscall_arg_fault.c - tests faults 32-bit fast syscall stack args 4 + * Copyright (c) 2018 Andrew Lutomirski 5 + */ 6 + 7 + #define _GNU_SOURCE 8 + 9 + #include <stdlib.h> 10 + #include <stdio.h> 11 + #include <stdbool.h> 12 + #include <errno.h> 13 + #include <unistd.h> 14 + #include <syscall.h> 15 + 16 + static int nerrs; 17 + 18 + #define X32_BIT 0x40000000UL 19 + 20 + static void check_enosys(unsigned long nr, bool *ok) 21 + { 22 + /* If this fails, a segfault is reasonably likely. */ 23 + fflush(stdout); 24 + 25 + long ret = syscall(nr, 0, 0, 0, 0, 0, 0); 26 + if (ret == 0) { 27 + printf("[FAIL]\tsyscall %lu succeeded, but it should have failed\n", nr); 28 + *ok = false; 29 + } else if (errno != ENOSYS) { 30 + printf("[FAIL]\tsyscall %lu had error code %d, but it should have reported ENOSYS\n", nr, errno); 31 + *ok = false; 32 + } 33 + } 34 + 35 + static void test_x32_without_x32_bit(void) 36 + { 37 + bool ok = true; 38 + 39 + /* 40 + * Syscalls 512-547 are "x32" syscalls. They are intended to be 41 + * called with the x32 (0x40000000) bit set. Calling them without 42 + * the x32 bit set is nonsense and should not work. 43 + */ 44 + printf("[RUN]\tChecking syscalls 512-547\n"); 45 + for (int i = 512; i <= 547; i++) 46 + check_enosys(i, &ok); 47 + 48 + /* 49 + * Check that a handful of 64-bit-only syscalls are rejected if the x32 50 + * bit is set. 51 + */ 52 + printf("[RUN]\tChecking some 64-bit syscalls in x32 range\n"); 53 + check_enosys(16 | X32_BIT, &ok); /* ioctl */ 54 + check_enosys(19 | X32_BIT, &ok); /* readv */ 55 + check_enosys(20 | X32_BIT, &ok); /* writev */ 56 + 57 + /* 58 + * Check some syscalls with high bits set. 
59 + */ 60 + printf("[RUN]\tChecking numbers above 2^32-1\n"); 61 + check_enosys((1UL << 32), &ok); 62 + check_enosys(X32_BIT | (1UL << 32), &ok); 63 + 64 + if (!ok) 65 + nerrs++; 66 + else 67 + printf("[OK]\tThey all returned -ENOSYS\n"); 68 + } 69 + 70 + int main() 71 + { 72 + /* 73 + * Anyone diagnosing a failure will want to know whether the kernel 74 + * supports x32. Tell them. 75 + */ 76 + printf("\tChecking for x32..."); 77 + fflush(stdout); 78 + if (syscall(39 | X32_BIT, 0, 0, 0, 0, 0, 0) >= 0) { 79 + printf(" supported\n"); 80 + } else if (errno == ENOSYS) { 81 + printf(" not supported\n"); 82 + } else { 83 + printf(" confused\n"); 84 + } 85 + 86 + test_x32_without_x32_bit(); 87 + 88 + return nerrs ? 1 : 0; 89 + }