Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'work.set_fs' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull initial set_fs() removal from Al Viro:
"Christoph's set_fs base series + fixups"

* 'work.set_fs' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
fs: Allow a NULL pos pointer to __kernel_read
fs: Allow a NULL pos pointer to __kernel_write
powerpc: remove address space overrides using set_fs()
powerpc: use non-set_fs based maccess routines
x86: remove address space overrides using set_fs()
x86: make TASK_SIZE_MAX usable from assembly code
x86: move PAGE_OFFSET, TASK_SIZE & friends to page_{32,64}_types.h
lkdtm: remove set_fs-based tests
test_bitmap: remove user bitmap tests
uaccess: add infrastructure for kernel builds with set_fs()
fs: don't allow splice read/write without explicit ops
fs: don't allow kernel reads and writes without iter ops
sysctl: Convert to iter interfaces
proc: add a read_iter method to proc proc_ops
proc: cleanup the compat vs no compat file ops
proc: remove a level of indentation in proc_get_inode

+346 -495
+3
arch/Kconfig
··· 24 24 config HAVE_IMA_KEXEC 25 25 bool 26 26 27 + config SET_FS 28 + bool 29 + 27 30 config HOTPLUG_SMT 28 31 bool 29 32
+1
arch/alpha/Kconfig
··· 39 39 select OLD_SIGSUSPEND 40 40 select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 41 41 select MMU_GATHER_NO_RANGE 42 + select SET_FS 42 43 help 43 44 The Alpha is a 64-bit general-purpose processor designed and 44 45 marketed by the Digital Equipment Corporation of blessed memory,
+1
arch/arc/Kconfig
··· 48 48 select PCI_SYSCALL if PCI 49 49 select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING 50 50 select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 51 + select SET_FS 51 52 52 53 config ARCH_HAS_CACHE_LINE_SIZE 53 54 def_bool y
+1
arch/arm/Kconfig
··· 120 120 select PCI_SYSCALL if PCI 121 121 select PERF_USE_VMALLOC 122 122 select RTC_LIB 123 + select SET_FS 123 124 select SYS_SUPPORTS_APM_EMULATION 124 125 # Above selects are sorted alphabetically; please add new ones 125 126 # according to that. Thanks.
+1
arch/arm64/Kconfig
··· 194 194 select PCI_SYSCALL if PCI 195 195 select POWER_RESET 196 196 select POWER_SUPPLY 197 + select SET_FS 197 198 select SPARSE_IRQ 198 199 select SWIOTLB 199 200 select SYSCTL_EXCEPTION_TRACE
+1
arch/c6x/Kconfig
··· 22 22 select GENERIC_CLOCKEVENTS 23 23 select MODULES_USE_ELF_RELA 24 24 select MMU_GATHER_NO_RANGE if MMU 25 + select SET_FS 25 26 26 27 config MMU 27 28 def_bool n
+1
arch/csky/Kconfig
··· 78 78 select PCI_DOMAINS_GENERIC if PCI 79 79 select PCI_SYSCALL if PCI 80 80 select PCI_MSI if PCI 81 + select SET_FS 81 82 82 83 config LOCKDEP_SUPPORT 83 84 def_bool y
+1
arch/h8300/Kconfig
··· 25 25 select HAVE_ARCH_KGDB 26 26 select HAVE_ARCH_HASH 27 27 select CPU_NO_EFFICIENT_FFS 28 + select SET_FS 28 29 select UACCESS_MEMCPY 29 30 30 31 config CPU_BIG_ENDIAN
+1
arch/hexagon/Kconfig
··· 31 31 select GENERIC_CLOCKEVENTS_BROADCAST 32 32 select MODULES_USE_ELF_RELA 33 33 select GENERIC_CPU_DEVICES 34 + select SET_FS 34 35 help 35 36 Qualcomm Hexagon is a processor architecture designed for high 36 37 performance and low power across a wide variety of applications.
+1
arch/ia64/Kconfig
··· 56 56 select NEED_SG_DMA_LENGTH 57 57 select NUMA if !FLATMEM 58 58 select PCI_MSI_ARCH_FALLBACKS if PCI_MSI 59 + select SET_FS 59 60 default y 60 61 help 61 62 The Itanium Processor Family is Intel's 64-bit successor to
+1
arch/m68k/Kconfig
··· 31 31 select NO_DMA if !MMU && !COLDFIRE 32 32 select OLD_SIGACTION 33 33 select OLD_SIGSUSPEND3 34 + select SET_FS 34 35 select UACCESS_MEMCPY if !MMU 35 36 select VIRT_TO_BUS 36 37
+1
arch/microblaze/Kconfig
··· 47 47 select CPU_NO_EFFICIENT_FFS 48 48 select MMU_GATHER_NO_RANGE if MMU 49 49 select SPARSE_IRQ 50 + select SET_FS 50 51 51 52 # Endianness selection 52 53 choice
+1
arch/mips/Kconfig
··· 88 88 select PERF_USE_VMALLOC 89 89 select PCI_MSI_ARCH_FALLBACKS if PCI_MSI 90 90 select RTC_LIB 91 + select SET_FS 91 92 select SYSCTL_EXCEPTION_TRACE 92 93 select VIRT_TO_BUS 93 94
+1
arch/nds32/Kconfig
··· 48 48 select HAVE_FUNCTION_GRAPH_TRACER 49 49 select HAVE_FTRACE_MCOUNT_RECORD 50 50 select HAVE_DYNAMIC_FTRACE 51 + select SET_FS 51 52 help 52 53 Andes(nds32) Linux support. 53 54
+1
arch/nios2/Kconfig
··· 27 27 select USB_ARCH_HAS_HCD if USB_SUPPORT 28 28 select CPU_NO_EFFICIENT_FFS 29 29 select MMU_GATHER_NO_RANGE if MMU 30 + select SET_FS 30 31 31 32 config GENERIC_CSUM 32 33 def_bool y
+1
arch/openrisc/Kconfig
··· 39 39 select ARCH_WANT_FRAME_POINTERS 40 40 select GENERIC_IRQ_MULTI_HANDLER 41 41 select MMU_GATHER_NO_RANGE if MMU 42 + select SET_FS 42 43 43 44 config CPU_BIG_ENDIAN 44 45 def_bool y
+1
arch/parisc/Kconfig
··· 63 63 select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE 64 64 select HAVE_KPROBES_ON_FTRACE 65 65 select HAVE_DYNAMIC_FTRACE_WITH_REGS 66 + select SET_FS 66 67 67 68 help 68 69 The PA-RISC microprocessor is designed by Hewlett-Packard and used
-7
arch/powerpc/include/asm/processor.h
··· 83 83 void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); 84 84 void release_thread(struct task_struct *); 85 85 86 - typedef struct { 87 - unsigned long seg; 88 - } mm_segment_t; 89 - 90 86 #define TS_FPR(i) fp_state.fpr[i][TS_FPROFFSET] 91 87 #define TS_CKFPR(i) ckfp_state.fpr[i][TS_FPROFFSET] 92 88 ··· 144 148 unsigned long ksp_vsid; 145 149 #endif 146 150 struct pt_regs *regs; /* Pointer to saved register state */ 147 - mm_segment_t addr_limit; /* for get_fs() validation */ 148 151 #ifdef CONFIG_BOOKE 149 152 /* BookE base exception scratch space; align on cacheline */ 150 153 unsigned long normsave[8] ____cacheline_aligned; ··· 291 296 #define INIT_THREAD { \ 292 297 .ksp = INIT_SP, \ 293 298 .ksp_limit = INIT_SP_LIMIT, \ 294 - .addr_limit = KERNEL_DS, \ 295 299 .pgdir = swapper_pg_dir, \ 296 300 .fpexc_mode = MSR_FE0 | MSR_FE1, \ 297 301 SPEFSCR_INIT \ ··· 298 304 #else 299 305 #define INIT_THREAD { \ 300 306 .ksp = INIT_SP, \ 301 - .addr_limit = KERNEL_DS, \ 302 307 .fpexc_mode = 0, \ 303 308 } 304 309 #endif
+1 -4
arch/powerpc/include/asm/thread_info.h
··· 90 90 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ 91 91 #define TIF_SIGPENDING 1 /* signal pending */ 92 92 #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ 93 - #define TIF_FSCHECK 3 /* Check FS is USER_DS on return */ 94 93 #define TIF_SYSCALL_EMU 4 /* syscall emulation active */ 95 94 #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */ 96 95 #define TIF_PATCH_PENDING 6 /* pending live patching update */ ··· 129 130 #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) 130 131 #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE) 131 132 #define _TIF_NOHZ (1<<TIF_NOHZ) 132 - #define _TIF_FSCHECK (1<<TIF_FSCHECK) 133 133 #define _TIF_SYSCALL_EMU (1<<TIF_SYSCALL_EMU) 134 134 #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ 135 135 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ ··· 136 138 137 139 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ 138 140 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ 139 - _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \ 140 - _TIF_FSCHECK) 141 + _TIF_RESTORE_TM | _TIF_PATCH_PENDING) 141 142 #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) 142 143 143 144 /* Bits in local_flags */
+21 -46
arch/powerpc/include/asm/uaccess.h
··· 8 8 #include <asm/extable.h> 9 9 #include <asm/kup.h> 10 10 11 - /* 12 - * The fs value determines whether argument validity checking should be 13 - * performed or not. If get_fs() == USER_DS, checking is performed, with 14 - * get_fs() == KERNEL_DS, checking is bypassed. 15 - * 16 - * For historical reasons, these macros are grossly misnamed. 17 - * 18 - * The fs/ds values are now the highest legal address in the "segment". 19 - * This simplifies the checking in the routines below. 20 - */ 21 - 22 - #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) 23 - 24 - #define KERNEL_DS MAKE_MM_SEG(~0UL) 25 11 #ifdef __powerpc64__ 26 12 /* We use TASK_SIZE_USER64 as TASK_SIZE is not constant */ 27 - #define USER_DS MAKE_MM_SEG(TASK_SIZE_USER64 - 1) 13 + #define TASK_SIZE_MAX TASK_SIZE_USER64 28 14 #else 29 - #define USER_DS MAKE_MM_SEG(TASK_SIZE - 1) 15 + #define TASK_SIZE_MAX TASK_SIZE 30 16 #endif 31 17 32 - #define get_fs() (current->thread.addr_limit) 33 - 34 - static inline void set_fs(mm_segment_t fs) 18 + static inline bool __access_ok(unsigned long addr, unsigned long size) 35 19 { 36 - current->thread.addr_limit = fs; 37 - /* On user-mode return check addr_limit (fs) is correct */ 38 - set_thread_flag(TIF_FSCHECK); 20 + return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX - addr; 39 21 } 40 - 41 - #define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) 42 - #define user_addr_max() (get_fs().seg) 43 - 44 - #ifdef __powerpc64__ 45 - /* 46 - * This check is sufficient because there is a large enough 47 - * gap between user addresses and the kernel addresses 48 - */ 49 - #define __access_ok(addr, size, segment) \ 50 - (((addr) <= (segment).seg) && ((size) <= (segment).seg)) 51 - 52 - #else 53 - 54 - static inline int __access_ok(unsigned long addr, unsigned long size, 55 - mm_segment_t seg) 56 - { 57 - if (addr > seg.seg) 58 - return 0; 59 - return (size == 0 || size - 1 <= seg.seg - addr); 60 - } 61 - 62 - #endif 63 22 64 23 #define access_ok(addr, size) \ 65 24 (__chk_user_ptr(addr), \ 66 - __access_ok((__force unsigned long)(addr), (size), get_fs())) 25 + __access_ok((unsigned long)(addr), (size))) 67 26 68 27 /* 69 28 * These are the main single-value transfer routines. They automatically ··· 562 603 if (_len & 1) \ 563 604 __put_user_goto(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), e);\ 564 605 } while (0) 606 + 607 + #define HAVE_GET_KERNEL_NOFAULT 608 + 609 + #define __get_kernel_nofault(dst, src, type, err_label) \ 610 + do { \ 611 + int __kr_err; \ 612 + \ 613 + __get_user_size_allowed(*((type *)(dst)), (__force type __user *)(src),\ 614 + sizeof(type), __kr_err); \ 615 + if (unlikely(__kr_err)) \ 616 + goto err_label; \ 617 + } while (0) 618 + 619 + #define __put_kernel_nofault(dst, src, type, err_label) \ 620 + __put_user_size_goto(*((type *)(src)), \ 621 + (__force type __user *)(dst), sizeof(type), err_label) 565 622 566 623 #endif /* _ARCH_POWERPC_UACCESS_H */
-3
arch/powerpc/kernel/signal.c
··· 312 312 { 313 313 user_exit(); 314 314 315 - /* Check valid addr_limit, TIF check is done there */ 316 - addr_limit_user_check(); 317 - 318 315 if (thread_info_flags & _TIF_UPROBE) 319 316 uprobe_notify_resume(regs); 320 317
+3 -3
arch/powerpc/lib/sstep.c
··· 108 108 { 109 109 if (!user_mode(regs)) 110 110 return 1; 111 - if (__access_ok(ea, nb, USER_DS)) 111 + if (__access_ok(ea, nb)) 112 112 return 1; 113 - if (__access_ok(ea, 1, USER_DS)) 113 + if (__access_ok(ea, 1)) 114 114 /* Access overlaps the end of the user region */ 115 - regs->dar = USER_DS.seg; 115 + regs->dar = TASK_SIZE_MAX - 1; 116 116 else 117 117 regs->dar = ea; 118 118 return 0;
+1
arch/riscv/Kconfig
··· 88 88 select SPARSE_IRQ 89 89 select SYSCTL_EXCEPTION_TRACE 90 90 select THREAD_INFO_IN_TASK 91 + select SET_FS 91 92 92 93 config ARCH_MMAP_RND_BITS_MIN 93 94 default 18 if 64BIT
+1
arch/s390/Kconfig
··· 191 191 select PCI_DOMAINS if PCI 192 192 select PCI_MSI if PCI 193 193 select PCI_MSI_ARCH_FALLBACKS if PCI_MSI 194 + select SET_FS 194 195 select SPARSE_IRQ 195 196 select SYSCTL_EXCEPTION_TRACE 196 197 select THREAD_INFO_IN_TASK
+1
arch/sh/Kconfig
··· 71 71 select PERF_EVENTS 72 72 select PERF_USE_VMALLOC 73 73 select RTC_LIB 74 + select SET_FS 74 75 select SPARSE_IRQ 75 76 help 76 77 The SuperH is a RISC processor targeted for use in embedded systems
+1
arch/sparc/Kconfig
··· 51 51 select LOCKDEP_SMALL if LOCKDEP 52 52 select NEED_DMA_MAP_STATE 53 53 select NEED_SG_DMA_LENGTH 54 + select SET_FS 54 55 55 56 config SPARC32 56 57 def_bool !64BIT
+1
arch/um/Kconfig
··· 19 19 select GENERIC_CPU_DEVICES 20 20 select GENERIC_CLOCKEVENTS 21 21 select HAVE_GCC_PLUGINS 22 + select SET_FS 22 23 select TTY # Needed for line.c 23 24 24 25 config MMU
-1
arch/x86/ia32/ia32_aout.c
··· 239 239 (regs)->ss = __USER32_DS; 240 240 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 241 241 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 242 - set_fs(USER_DS); 243 242 return 0; 244 243 } 245 244
+11
arch/x86/include/asm/page_32_types.h
··· 42 42 #endif /* CONFIG_X86_PAE */ 43 43 44 44 /* 45 + * User space process size: 3GB (default). 46 + */ 47 + #define IA32_PAGE_OFFSET __PAGE_OFFSET 48 + #define TASK_SIZE __PAGE_OFFSET 49 + #define TASK_SIZE_LOW TASK_SIZE 50 + #define TASK_SIZE_MAX TASK_SIZE 51 + #define DEFAULT_MAP_WINDOW TASK_SIZE 52 + #define STACK_TOP TASK_SIZE 53 + #define STACK_TOP_MAX STACK_TOP 54 + 55 + /* 45 56 * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S) 46 57 */ 47 58 #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+38
arch/x86/include/asm/page_64_types.h
··· 60 60 #endif 61 61 62 62 /* 63 + * User space process size. This is the first address outside the user range. 64 + * There are a few constraints that determine this: 65 + * 66 + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical 67 + * address, then that syscall will enter the kernel with a 68 + * non-canonical return address, and SYSRET will explode dangerously. 69 + * We avoid this particular problem by preventing anything executable 70 + * from being mapped at the maximum canonical address. 71 + * 72 + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the 73 + * CPUs malfunction if they execute code from the highest canonical page. 74 + * They'll speculate right off the end of the canonical space, and 75 + * bad things happen. This is worked around in the same way as the 76 + * Intel problem. 77 + * 78 + * With page table isolation enabled, we map the LDT in ... [stay tuned] 79 + */ 80 + #define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) 81 + 82 + #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) 83 + 84 + /* This decides where the kernel will search for a free chunk of vm 85 + * space during mmap's. 86 + */ 87 + #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 88 + 0xc0000000 : 0xFFFFe000) 89 + 90 + #define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ 91 + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) 92 + #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ 93 + IA32_PAGE_OFFSET : TASK_SIZE_MAX) 94 + #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ 95 + IA32_PAGE_OFFSET : TASK_SIZE_MAX) 96 + 97 + #define STACK_TOP TASK_SIZE_LOW 98 + #define STACK_TOP_MAX TASK_SIZE_MAX 99 + 100 + /* 63 101 * Maximum kernel image size is limited to 1 GiB, due to the fixmap living 64 102 * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). 65 103 *
+1 -59
arch/x86/include/asm/processor.h
··· 482 482 483 483 struct perf_event; 484 484 485 - typedef struct { 486 - unsigned long seg; 487 - } mm_segment_t; 488 - 489 485 struct thread_struct { 490 486 /* Cached TLS descriptors: */ 491 487 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; ··· 533 537 * interrupts. 534 538 */ 535 539 unsigned long iopl_emul; 536 - 537 - mm_segment_t addr_limit; 538 540 539 541 unsigned int sig_on_uaccess_err:1; 540 542 ··· 777 783 }) 778 784 779 785 #ifdef CONFIG_X86_32 780 - /* 781 - * User space process size: 3GB (default). 782 - */ 783 - #define IA32_PAGE_OFFSET PAGE_OFFSET 784 - #define TASK_SIZE PAGE_OFFSET 785 - #define TASK_SIZE_LOW TASK_SIZE 786 - #define TASK_SIZE_MAX TASK_SIZE 787 - #define DEFAULT_MAP_WINDOW TASK_SIZE 788 - #define STACK_TOP TASK_SIZE 789 - #define STACK_TOP_MAX STACK_TOP 790 - 791 786 #define INIT_THREAD { \ 792 787 .sp0 = TOP_OF_INIT_STACK, \ 793 788 .sysenter_cs = __KERNEL_CS, \ 794 - .addr_limit = KERNEL_DS, \ 795 789 } 796 790 797 791 #define KSTK_ESP(task) (task_pt_regs(task)->sp) 798 792 799 793 #else 800 - /* 801 - * User space process size. This is the first address outside the user range. 802 - * There are a few constraints that determine this: 803 - * 804 - * On Intel CPUs, if a SYSCALL instruction is at the highest canonical 805 - * address, then that syscall will enter the kernel with a 806 - * non-canonical return address, and SYSRET will explode dangerously. 807 - * We avoid this particular problem by preventing anything executable 808 - * from being mapped at the maximum canonical address. 809 - * 810 - * On AMD CPUs in the Ryzen family, there's a nasty bug in which the 811 - * CPUs malfunction if they execute code from the highest canonical page. 812 - * They'll speculate right off the end of the canonical space, and 813 - * bad things happen. This is worked around in the same way as the 814 - * Intel problem. 815 - * 816 - * With page table isolation enabled, we map the LDT in ... [stay tuned] 817 - */ 818 - #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) 819 - 820 - #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) 821 - 822 - /* This decides where the kernel will search for a free chunk of vm 823 - * space during mmap's. 824 - */ 825 - #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 826 - 0xc0000000 : 0xFFFFe000) 827 - 828 - #define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ 829 - IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) 830 - #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ 831 - IA32_PAGE_OFFSET : TASK_SIZE_MAX) 832 - #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ 833 - IA32_PAGE_OFFSET : TASK_SIZE_MAX) 834 - 835 - #define STACK_TOP TASK_SIZE_LOW 836 - #define STACK_TOP_MAX TASK_SIZE_MAX 837 - 838 - #define INIT_THREAD { \ 839 - .addr_limit = KERNEL_DS, \ 840 - } 794 + #define INIT_THREAD { } 841 795 842 796 extern unsigned long KSTK_ESP(struct task_struct *task); 843 797
-2
arch/x86/include/asm/thread_info.h
··· 102 102 #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ 103 103 #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ 104 104 #define TIF_X32 30 /* 32-bit native x86-64 binary */ 105 - #define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ 106 105 107 106 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 108 107 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) ··· 130 131 #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) 131 132 #define _TIF_ADDR32 (1 << TIF_ADDR32) 132 133 #define _TIF_X32 (1 << TIF_X32) 133 - #define _TIF_FSCHECK (1 << TIF_FSCHECK) 134 134 135 135 /* flags to check in __switch_to() */ 136 136 #define _TIF_WORK_CTXSW_BASE \
+1 -25
arch/x86/include/asm/uaccess.h
··· 13 13 #include <asm/extable.h> 14 14 15 15 /* 16 - * The fs value determines whether argument validity checking should be 17 - * performed or not. If get_fs() == USER_DS, checking is performed, with 18 - * get_fs() == KERNEL_DS, checking is bypassed. 19 - * 20 - * For historical reasons, these macros are grossly misnamed. 21 - */ 22 - 23 - #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) 24 - 25 - #define KERNEL_DS MAKE_MM_SEG(-1UL) 26 - #define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX) 27 - 28 - #define get_fs() (current->thread.addr_limit) 29 - static inline void set_fs(mm_segment_t fs) 30 - { 31 - current->thread.addr_limit = fs; 32 - /* On user-mode return, check fs is correct */ 33 - set_thread_flag(TIF_FSCHECK); 34 - } 35 - 36 - #define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) 37 - #define user_addr_max() (current->thread.addr_limit.seg) 38 - 39 - /* 40 16 * Test whether a block of memory is a valid user space address. 41 17 * Returns 0 if the range is valid, nonzero otherwise. 42 18 */ ··· 69 93 #define access_ok(addr, size) \ 70 94 ({ \ 71 95 WARN_ON_IN_IRQ(); \ 72 - likely(!__range_not_ok(addr, size, user_addr_max())); \ 96 + likely(!__range_not_ok(addr, size, TASK_SIZE_MAX)); \ 73 97 }) 74 98 75 99 extern int __get_user_1(void);
-3
arch/x86/kernel/asm-offsets.c
··· 38 38 #endif 39 39 40 40 BLANK(); 41 - OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); 42 - 43 - BLANK(); 44 41 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 45 42 46 43 BLANK();
+24 -23
arch/x86/lib/getuser.S
··· 37 37 38 38 #define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC 39 39 40 + #ifdef CONFIG_X86_5LEVEL 41 + #define LOAD_TASK_SIZE_MINUS_N(n) \ 42 + ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \ 43 + __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57 44 + #else 45 + #define LOAD_TASK_SIZE_MINUS_N(n) \ 46 + mov $(TASK_SIZE_MAX - (n)),%_ASM_DX 47 + #endif 48 + 40 49 .text 41 50 SYM_FUNC_START(__get_user_1) 42 - mov PER_CPU_VAR(current_task), %_ASM_DX 43 - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX 51 + LOAD_TASK_SIZE_MINUS_N(0) 52 + cmp %_ASM_DX,%_ASM_AX 44 53 jae bad_get_user 45 54 sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ 46 55 and %_ASM_DX, %_ASM_AX ··· 62 53 EXPORT_SYMBOL(__get_user_1) 63 54 64 55 SYM_FUNC_START(__get_user_2) 65 - add $1,%_ASM_AX 66 - jc bad_get_user 67 - mov PER_CPU_VAR(current_task), %_ASM_DX 68 - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX 56 + LOAD_TASK_SIZE_MINUS_N(1) 57 + cmp %_ASM_DX,%_ASM_AX 69 58 jae bad_get_user 70 59 sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ 71 60 and %_ASM_DX, %_ASM_AX 72 61 ASM_STAC 73 - 2: movzwl -1(%_ASM_AX),%edx 62 + 2: movzwl (%_ASM_AX),%edx 74 63 xor %eax,%eax 75 64 ASM_CLAC 76 65 ret ··· 76 69 EXPORT_SYMBOL(__get_user_2) 77 70 78 71 SYM_FUNC_START(__get_user_4) 79 - add $3,%_ASM_AX 80 - jc bad_get_user 81 - mov PER_CPU_VAR(current_task), %_ASM_DX 82 - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX 72 + LOAD_TASK_SIZE_MINUS_N(3) 73 + cmp %_ASM_DX,%_ASM_AX 83 74 jae bad_get_user 84 75 sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ 85 76 and %_ASM_DX, %_ASM_AX 86 77 ASM_STAC 87 - 3: movl -3(%_ASM_AX),%edx 78 + 3: movl (%_ASM_AX),%edx 88 79 xor %eax,%eax 89 80 ASM_CLAC 90 81 ret ··· 91 86 92 87 SYM_FUNC_START(__get_user_8) 93 88 #ifdef CONFIG_X86_64 94 - add $7,%_ASM_AX 95 - jc bad_get_user 96 - mov PER_CPU_VAR(current_task), %_ASM_DX 97 - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX 89 + LOAD_TASK_SIZE_MINUS_N(7) 90 + cmp %_ASM_DX,%_ASM_AX 98 91 jae bad_get_user 99 92 sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ 100 93 and %_ASM_DX, %_ASM_AX 101 94 ASM_STAC 102 - 4: movq -7(%_ASM_AX),%rdx 95 + 4: movq (%_ASM_AX),%rdx 103 96 xor %eax,%eax 104 97 ASM_CLAC 105 98 ret 106 99 #else 107 - add $7,%_ASM_AX 108 - jc bad_get_user_8 109 - mov PER_CPU_VAR(current_task), %_ASM_DX 110 - cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX 100 + LOAD_TASK_SIZE_MINUS_N(7) 101 + cmp %_ASM_DX,%_ASM_AX 111 102 jae bad_get_user_8 112 103 sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ 113 104 and %_ASM_DX, %_ASM_AX 114 105 ASM_STAC 115 - 4: movl -7(%_ASM_AX),%edx 116 - 5: movl -3(%_ASM_AX),%ecx 106 + 4: movl (%_ASM_AX),%edx 107 + 5: movl 4(%_ASM_AX),%ecx 117 108 xor %eax,%eax 118 109 ASM_CLAC 119 110 ret
+13 -12
arch/x86/lib/putuser.S
··· 33 33 * as they get called from within inline assembly. 34 34 */ 35 35 36 - #define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX 36 + #ifdef CONFIG_X86_5LEVEL 37 + #define LOAD_TASK_SIZE_MINUS_N(n) \ 38 + ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \ 39 + __stringify(mov $((1 << 56) - 4096 - (n)),%rbx), X86_FEATURE_LA57 40 + #else 41 + #define LOAD_TASK_SIZE_MINUS_N(n) \ 42 + mov $(TASK_SIZE_MAX - (n)),%_ASM_BX 43 + #endif 37 44 38 45 .text 39 46 SYM_FUNC_START(__put_user_1) 40 - ENTER 41 - cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX 47 + LOAD_TASK_SIZE_MINUS_N(0) 48 + cmp %_ASM_BX,%_ASM_CX 42 49 jae .Lbad_put_user 43 50 SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) 44 51 ASM_STAC ··· 58 51 EXPORT_SYMBOL(__put_user_nocheck_1) 59 52 60 53 SYM_FUNC_START(__put_user_2) 61 - ENTER 62 - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX 63 - sub $1,%_ASM_BX 54 + LOAD_TASK_SIZE_MINUS_N(1) 64 55 cmp %_ASM_BX,%_ASM_CX 65 56 jae .Lbad_put_user 66 57 SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL) ··· 72 67 EXPORT_SYMBOL(__put_user_nocheck_2) 73 68 74 69 SYM_FUNC_START(__put_user_4) 75 - ENTER 76 - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX 77 - sub $3,%_ASM_BX 70 + LOAD_TASK_SIZE_MINUS_N(3) 78 71 cmp %_ASM_BX,%_ASM_CX 79 72 jae .Lbad_put_user 80 73 SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL) ··· 86 83 EXPORT_SYMBOL(__put_user_nocheck_4) 87 84 88 85 SYM_FUNC_START(__put_user_8) 89 - ENTER 90 - mov TASK_addr_limit(%_ASM_BX),%_ASM_BX 91 - sub $7,%_ASM_BX 86 + LOAD_TASK_SIZE_MINUS_N(7) 92 87 cmp %_ASM_BX,%_ASM_CX 93 88 jae .Lbad_put_user 94 89 SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL)
+1
arch/xtensa/Kconfig
··· 41 41 select IRQ_DOMAIN 42 42 select MODULES_USE_ELF_RELA 43 43 select PERF_USE_VMALLOC 44 + select SET_FS 44 45 select VIRT_TO_BUS 45 46 help 46 47 Xtensa processors are 32-bit RISC machines designed by Tensilica
-10
drivers/misc/lkdtm/bugs.c
··· 312 312 pr_err("list_del() corruption not detected!\n"); 313 313 } 314 314 315 - /* Test if unbalanced set_fs(KERNEL_DS)/set_fs(USER_DS) check exists. */ 316 - void lkdtm_CORRUPT_USER_DS(void) 317 - { 318 - pr_info("setting bad task size limit\n"); 319 - set_fs(KERNEL_DS); 320 - 321 - /* Make sure we do not keep running with a KERNEL_DS! */ 322 - force_sig(SIGKILL); 323 - } 324 - 325 315 /* Test that VMAP_STACK is actually allocating with a leading guard page */ 326 316 void lkdtm_STACK_GUARD_PAGE_LEADING(void) 327 317 {
-2
drivers/misc/lkdtm/core.c
··· 112 112 CRASHTYPE(CORRUPT_STACK_STRONG), 113 113 CRASHTYPE(CORRUPT_LIST_ADD), 114 114 CRASHTYPE(CORRUPT_LIST_DEL), 115 - CRASHTYPE(CORRUPT_USER_DS), 116 115 CRASHTYPE(STACK_GUARD_PAGE_LEADING), 117 116 CRASHTYPE(STACK_GUARD_PAGE_TRAILING), 118 117 CRASHTYPE(UNSET_SMEP), ··· 171 172 CRASHTYPE(USERCOPY_STACK_FRAME_FROM), 172 173 CRASHTYPE(USERCOPY_STACK_BEYOND), 173 174 CRASHTYPE(USERCOPY_KERNEL), 174 - CRASHTYPE(USERCOPY_KERNEL_DS), 175 175 CRASHTYPE(STACKLEAK_ERASING), 176 176 CRASHTYPE(CFI_FORWARD_PROTO), 177 177 #ifdef CONFIG_X86_32
-2
drivers/misc/lkdtm/lkdtm.h
··· 27 27 void lkdtm_ARRAY_BOUNDS(void); 28 28 void lkdtm_CORRUPT_LIST_ADD(void); 29 29 void lkdtm_CORRUPT_LIST_DEL(void); 30 - void lkdtm_CORRUPT_USER_DS(void); 31 30 void lkdtm_STACK_GUARD_PAGE_LEADING(void); 32 31 void lkdtm_STACK_GUARD_PAGE_TRAILING(void); 33 32 void lkdtm_UNSET_SMEP(void); ··· 95 96 void lkdtm_USERCOPY_STACK_FRAME_FROM(void); 96 97 void lkdtm_USERCOPY_STACK_BEYOND(void); 97 98 void lkdtm_USERCOPY_KERNEL(void); 98 - void lkdtm_USERCOPY_KERNEL_DS(void); 99 99 100 100 /* lkdtm_stackleak.c */ 101 101 void lkdtm_STACKLEAK_ERASING(void);
-15
drivers/misc/lkdtm/usercopy.c
··· 325 325 vm_munmap(user_addr, PAGE_SIZE); 326 326 } 327 327 328 - void lkdtm_USERCOPY_KERNEL_DS(void) 329 - { 330 - char __user *user_ptr = 331 - (char __user *)(0xFUL << (sizeof(unsigned long) * 8 - 4)); 332 - mm_segment_t old_fs = get_fs(); 333 - char buf[10] = {0}; 334 - 335 - pr_info("attempting copy_to_user() to noncanonical address: %px\n", 336 - user_ptr); 337 - set_fs(KERNEL_DS); 338 - if (copy_to_user(user_ptr, buf, sizeof(buf)) == 0) 339 - pr_err("copy_to_user() to noncanonical address succeeded!?\n"); 340 - set_fs(old_fs); 341 - } 342 - 343 328 void __init lkdtm_usercopy_init(void) 344 329 { 345 330 /* Prepare cache that lacks SLAB_USERCOPY flag. */
+83 -36
fs/proc/inode.c
··· 297 297 return rv; 298 298 } 299 299 300 + static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) 301 + { 302 + struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp)); 303 + ssize_t ret; 304 + 305 + if (pde_is_permanent(pde)) 306 + return pde->proc_ops->proc_read_iter(iocb, iter); 307 + 308 + if (!use_pde(pde)) 309 + return -EIO; 310 + ret = pde->proc_ops->proc_read_iter(iocb, iter); 311 + unuse_pde(pde); 312 + return ret; 313 + } 314 + 300 315 static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) 301 316 { 302 317 typeof_member(struct proc_ops, proc_read) read; ··· 587 572 .write = proc_reg_write, 588 573 .poll = proc_reg_poll, 589 574 .unlocked_ioctl = proc_reg_unlocked_ioctl, 590 - #ifdef CONFIG_COMPAT 591 - .compat_ioctl = proc_reg_compat_ioctl, 592 - #endif 575 + .mmap = proc_reg_mmap, 576 + .get_unmapped_area = proc_reg_get_unmapped_area, 577 + .open = proc_reg_open, 578 + .release = proc_reg_release, 579 + }; 580 + 581 + static const struct file_operations proc_iter_file_ops = { 582 + .llseek = proc_reg_llseek, 583 + .read_iter = proc_reg_read_iter, 584 + .write = proc_reg_write, 585 + .poll = proc_reg_poll, 586 + .unlocked_ioctl = proc_reg_unlocked_ioctl, 593 587 .mmap = proc_reg_mmap, 594 588 .get_unmapped_area = proc_reg_get_unmapped_area, 595 589 .open = proc_reg_open, ··· 606 582 }; 607 583 608 584 #ifdef CONFIG_COMPAT 609 - static const struct file_operations proc_reg_file_ops_no_compat = { 585 + static const struct file_operations proc_reg_file_ops_compat = { 610 586 .llseek = proc_reg_llseek, 611 587 .read = proc_reg_read, 612 588 .write = proc_reg_write, 613 589 .poll = proc_reg_poll, 614 590 .unlocked_ioctl = proc_reg_unlocked_ioctl, 591 + .compat_ioctl = proc_reg_compat_ioctl, 592 + .mmap = proc_reg_mmap, 593 + .get_unmapped_area = proc_reg_get_unmapped_area, 594 + .open = proc_reg_open, 595 + .release = proc_reg_release, 596 + }; 597 + 598 + static const struct file_operations proc_iter_file_ops_compat = { 599 + .llseek = proc_reg_llseek, 600 + .read_iter = proc_reg_read_iter, 601 + .write = proc_reg_write, 602 + .poll = proc_reg_poll, 603 + .unlocked_ioctl = proc_reg_unlocked_ioctl, 604 + .compat_ioctl = proc_reg_compat_ioctl, 615 605 .mmap = proc_reg_mmap, 616 606 .get_unmapped_area = proc_reg_get_unmapped_area, 617 607 .open = proc_reg_open, ··· 657 619 { 658 620 struct inode *inode = new_inode(sb); 659 621 660 - if (inode) { 661 - inode->i_ino = de->low_ino; 662 - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); 663 - PROC_I(inode)->pde = de; 622 + if (!inode) { 623 + pde_put(de); 624 + return NULL; 625 + } 664 626 665 - if (is_empty_pde(de)) { 666 - make_empty_dir_inode(inode); 667 - return inode; 668 - } 669 - if (de->mode) { 670 - inode->i_mode = de->mode; 671 - inode->i_uid = de->uid; 672 - inode->i_gid = de->gid; 673 - } 674 - if (de->size) 675 - inode->i_size = de->size; 676 - if (de->nlink) 677 - set_nlink(inode, de->nlink); 627 + inode->i_ino = de->low_ino; 628 + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); 629 + PROC_I(inode)->pde = de; 630 + if (is_empty_pde(de)) { 631 + make_empty_dir_inode(inode); 632 + return inode; 633 + } 678 634 679 - if (S_ISREG(inode->i_mode)) { 680 - inode->i_op = de->proc_iops; 635 + if (de->mode) { 636 + inode->i_mode = de->mode; 637 + inode->i_uid = de->uid; 638 + inode->i_gid = de->gid; 639 + } 640 + if (de->size) 641 + inode->i_size = de->size; 642 + if (de->nlink) 643 + set_nlink(inode, de->nlink); 644 + 645 + if (S_ISREG(inode->i_mode)) { 646 + inode->i_op = de->proc_iops; 647 + if (de->proc_ops->proc_read_iter) 648 + inode->i_fop = &proc_iter_file_ops; 649 + else 681 650 inode->i_fop = &proc_reg_file_ops; 682 651 #ifdef CONFIG_COMPAT 683 - if (!de->proc_ops->proc_compat_ioctl) { 684 - inode->i_fop = &proc_reg_file_ops_no_compat; 685 - } 652 + if (de->proc_ops->proc_compat_ioctl) { 653 + if (de->proc_ops->proc_read_iter) 654 + inode->i_fop = &proc_iter_file_ops_compat; 655 + else 656 + inode->i_fop = &proc_reg_file_ops_compat; 657 + } 686 658 #endif 687 - } else if (S_ISDIR(inode->i_mode)) { 688 - inode->i_op = de->proc_iops; 689 - inode->i_fop = de->proc_dir_ops; 690 - } else if (S_ISLNK(inode->i_mode)) { 691 - inode->i_op = de->proc_iops; 692 - inode->i_fop = NULL; 693 - } else 694 - BUG(); 695 - } else 696 - pde_put(de); 659 + } else if (S_ISDIR(inode->i_mode)) { 660 + inode->i_op = de->proc_iops; 661 + inode->i_fop = de->proc_dir_ops; 662 + } else if (S_ISLNK(inode->i_mode)) { 663 + inode->i_op = de->proc_iops; 664 + inode->i_fop = NULL; 665 + } else { 666 + BUG(); 667 + } 697 668 return inode; 698 669 }
+24 -24
fs/proc/proc_sysctl.c
··· 12 12 #include <linux/cred.h> 13 13 #include <linux/namei.h> 14 14 #include <linux/mm.h> 15 + #include <linux/uio.h> 15 16 #include <linux/module.h> 16 17 #include <linux/bpf-cgroup.h> 17 18 #include <linux/mount.h> ··· 541 540 return err; 542 541 } 543 542 544 - static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, 545 - size_t count, loff_t *ppos, int write) 543 + static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, 544 + int write) 546 545 { 547 - struct inode *inode = file_inode(filp); 546 + struct inode *inode = file_inode(iocb->ki_filp); 548 547 struct ctl_table_header *head = grab_header(inode); 549 548 struct ctl_table *table = PROC_I(inode)->sysctl_entry; 550 - void *kbuf; 549 + size_t count = iov_iter_count(iter); 550 + char *kbuf; 551 551 ssize_t error; 552 552 553 553 if (IS_ERR(head)) ··· 571 569 error = -ENOMEM; 572 570 if (count >= KMALLOC_MAX_SIZE) 573 571 goto out; 572 + kbuf = kzalloc(count + 1, GFP_KERNEL); 573 + if (!kbuf) 574 + goto out; 574 575 575 576 if (write) { 576 - kbuf = memdup_user_nul(ubuf, count); 577 - if (IS_ERR(kbuf)) { 578 - error = PTR_ERR(kbuf); 579 - goto out; 580 - } 581 - } else { 582 - kbuf = kzalloc(count, GFP_KERNEL); 583 - if (!kbuf) 584 - goto out; 577 + error = -EFAULT; 578 + if (!copy_from_iter_full(kbuf, count, iter)) 579 + goto out_free_buf; 580 + kbuf[count] = '\0'; 585 581 } 586 582 587 583 error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, 588 - ppos); 584 + &iocb->ki_pos); 589 585 if (error) 590 586 goto out_free_buf; 591 587 592 588 /* careful: calling conventions are nasty here */ 593 - error = table->proc_handler(table, write, kbuf, &count, ppos); 589 + error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); 594 590 if (error) 595 591 goto out_free_buf; 596 592 597 593 if (!write) { 598 594 error = -EFAULT; 599 - if (copy_to_user(ubuf, kbuf, count)) 595 + if (copy_to_iter(kbuf, count, iter) < count) 600 596 goto out_free_buf; 601 597 } 602 598 ··· 607 607 return error; 608 608 } 609 609 610 - static ssize_t proc_sys_read(struct file *filp, char __user *buf, 611 - size_t count, loff_t *ppos) 610 + static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter) 612 611 { 613 - return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); 612 + return proc_sys_call_handler(iocb, iter, 0); 614 613 } 615 614 616 - static ssize_t proc_sys_write(struct file *filp, const char __user *buf, 617 - size_t count, loff_t *ppos) 615 + static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) 618 616 { 619 - return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); 617 + return proc_sys_call_handler(iocb, iter, 1); 620 618 } 621 619 622 620 static int proc_sys_open(struct inode *inode, struct file *filp) ··· 851 853 static const struct file_operations proc_sys_file_operations = { 852 854 .open = proc_sys_open, 853 855 .poll = proc_sys_poll, 854 - .read = proc_sys_read, 855 - .write = proc_sys_write, 856 + .read_iter = proc_sys_read, 857 + .write_iter = proc_sys_write, 858 + .splice_read = generic_file_splice_read, 859 + .splice_write = iter_file_splice_write, 856 860 .llseek = default_llseek, 857 861 }; 858 862
+45 -26
fs/read_write.c
··· 419 419 return ret; 420 420 } 421 421 422 + static int warn_unsupported(struct file *file, const char *op) 423 + { 424 + pr_warn_ratelimited( 425 + "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", 426 + op, file, current->pid, current->comm); 427 + return -EINVAL; 428 + } 429 + 422 430 ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) 423 431 { 424 - mm_segment_t old_fs = get_fs(); 432 + struct kvec iov = { 433 + .iov_base = buf, 434 + .iov_len = min_t(size_t, count, MAX_RW_COUNT), 435 + }; 436 + struct kiocb kiocb; 437 + struct iov_iter iter; 425 438 ssize_t ret; 426 439 427 440 if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) 428 441 return -EINVAL; 429 442 if (!(file->f_mode & FMODE_CAN_READ)) 430 443 return -EINVAL; 444 + /* 445 + * Also fail if ->read_iter and ->read are both wired up as that 446 + * implies very convoluted semantics. 447 + */ 448 + if (unlikely(!file->f_op->read_iter || file->f_op->read)) 449 + return warn_unsupported(file, "read"); 431 450 432 - if (count > MAX_RW_COUNT) 433 - count = MAX_RW_COUNT; 434 - set_fs(KERNEL_DS); 435 - if (file->f_op->read) 436 - ret = file->f_op->read(file, (void __user *)buf, count, pos); 437 - else if (file->f_op->read_iter) 438 - ret = new_sync_read(file, (void __user *)buf, count, pos); 439 - else 440 - ret = -EINVAL; 441 - set_fs(old_fs); 451 + init_sync_kiocb(&kiocb, file); 452 + kiocb.ki_pos = pos ? *pos : 0; 453 + iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len); 454 + ret = file->f_op->read_iter(&kiocb, &iter); 442 455 if (ret > 0) { 456 + if (pos) 457 + *pos = kiocb.ki_pos; 443 458 fsnotify_access(file); 444 459 add_rchar(current, ret); 445 460 } ··· 525 510 /* caller is responsible for file_start_write/file_end_write */ 526 511 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) 527 512 { 528 - mm_segment_t old_fs; 529 - const char __user *p; 513 + struct kvec iov = { 514 + .iov_base = (void *)buf, 515 + .iov_len = min_t(size_t, count, MAX_RW_COUNT), 516 + }; 517 + struct kiocb kiocb; 518 + struct iov_iter iter; 530 519 ssize_t ret; 531 520 532 521 if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) 533 522 return -EBADF; 534 523 if (!(file->f_mode & FMODE_CAN_WRITE)) 535 524 return -EINVAL; 525 + /* 526 + * Also fail if ->write_iter and ->write are both wired up as that 527 + * implies very convoluted semantics. 528 + */ 529 + if (unlikely(!file->f_op->write_iter || file->f_op->write)) 530 + return warn_unsupported(file, "write"); 536 531 537 - old_fs = get_fs(); 538 - set_fs(KERNEL_DS); 539 - p = (__force const char __user *)buf; 540 - if (count > MAX_RW_COUNT) 541 - count = MAX_RW_COUNT; 542 - if (file->f_op->write) 543 - ret = file->f_op->write(file, p, count, pos); 544 - else if (file->f_op->write_iter) 545 - ret = new_sync_write(file, p, count, pos); 546 - else 547 - ret = -EINVAL; 548 - set_fs(old_fs); 532 + init_sync_kiocb(&kiocb, file); 533 + kiocb.ki_pos = pos ? *pos : 0; 534 + iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); 535 + ret = file->f_op->write_iter(&kiocb, &iter); 549 536 if (ret > 0) { 537 + if (pos) 538 + *pos = kiocb.ki_pos; 550 539 fsnotify_modify(file); 551 540 add_wchar(current, ret); 552 541 } ··· 908 889 } 909 890 EXPORT_SYMBOL(vfs_iter_write); 910 891 911 - ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 892 + static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 912 893 unsigned long vlen, loff_t *pos, rwf_t flags) 913 894 { 914 895 struct iovec iovstack[UIO_FASTIOV];
+14 -116
fs/splice.c
··· 341 341 }; 342 342 EXPORT_SYMBOL(nosteal_pipe_buf_ops); 343 343 344 - static ssize_t kernel_readv(struct file *file, const struct kvec *vec, 345 - unsigned long vlen, loff_t offset) 346 - { 347 - mm_segment_t old_fs; 348 - loff_t pos = offset; 349 - ssize_t res; 350 - 351 - old_fs = get_fs(); 352 - set_fs(KERNEL_DS); 353 - /* The cast to a user pointer is valid due to the set_fs() */ 354 - res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); 355 - set_fs(old_fs); 356 - 357 - return res; 358 - } 359 - 360 - static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, 361 - struct pipe_inode_info *pipe, size_t len, 362 - unsigned int flags) 363 - { 364 - struct kvec *vec, __vec[PIPE_DEF_BUFFERS]; 365 - struct iov_iter to; 366 - struct page **pages; 367 - unsigned int nr_pages; 368 - unsigned int mask; 369 - size_t offset, base, copied = 0; 370 - ssize_t res; 371 - int i; 372 - 373 - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 374 - return -EAGAIN; 375 - 376 - /* 377 - * Try to keep page boundaries matching to source pagecache ones - 378 - * it probably won't be much help, but... 379 - */ 380 - offset = *ppos & ~PAGE_MASK; 381 - 382 - iov_iter_pipe(&to, READ, pipe, len + offset); 383 - 384 - res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); 385 - if (res <= 0) 386 - return -ENOMEM; 387 - 388 - nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE); 389 - 390 - vec = __vec; 391 - if (nr_pages > PIPE_DEF_BUFFERS) { 392 - vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL); 393 - if (unlikely(!vec)) { 394 - res = -ENOMEM; 395 - goto out; 396 - } 397 - } 398 - 399 - mask = pipe->ring_size - 1; 400 - pipe->bufs[to.head & mask].offset = offset; 401 - pipe->bufs[to.head & mask].len -= offset; 402 - 403 - for (i = 0; i < nr_pages; i++) { 404 - size_t this_len = min_t(size_t, len, PAGE_SIZE - offset); 405 - vec[i].iov_base = page_address(pages[i]) + offset; 406 - vec[i].iov_len = this_len; 407 - len -= this_len; 408 - offset = 0; 409 - } 410 - 411 - res = kernel_readv(in, vec, nr_pages, *ppos); 412 - if (res > 0) { 413 - copied = res; 414 - *ppos += res; 415 - } 416 - 417 - if (vec != __vec) 418 - kfree(vec); 419 - out: 420 - for (i = 0; i < nr_pages; i++) 421 - put_page(pages[i]); 422 - kvfree(pages); 423 - iov_iter_advance(&to, copied); /* truncates and discards */ 424 - return res; 425 - } 426 - 427 344 /* 428 345 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 429 346 * using sendpage(). Return the number of bytes sent. ··· 724 807 725 808 EXPORT_SYMBOL(iter_file_splice_write); 726 809 727 - static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 728 - struct splice_desc *sd) 729 - { 730 - int ret; 731 - void *data; 732 - loff_t tmp = sd->pos; 733 - 734 - data = kmap(buf->page); 735 - ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); 736 - kunmap(buf->page); 737 - 738 - return ret; 739 - } 740 - 741 - static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, 742 - struct file *out, loff_t *ppos, 743 - size_t len, unsigned int flags) 744 - { 745 - ssize_t ret; 746 - 747 - ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); 748 - if (ret > 0) 749 - *ppos += ret; 750 - 751 - return ret; 752 - } 753 - 754 810 /** 755 811 * generic_splice_sendpage - splice data from a pipe to a socket 756 812 * @pipe: pipe to splice from ··· 745 855 746 856 EXPORT_SYMBOL(generic_splice_sendpage); 747 857 858 + static int warn_unsupported(struct file *file, const char *op) 859 + { 860 + pr_debug_ratelimited( 861 + "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n", 862 + op, file, current->pid, current->comm); 863 + return -EINVAL; 864 + } 865 + 748 866 /* 749 867 * Attempt to initiate a splice from pipe to file. 750 868 */ 751 869 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 752 870 loff_t *ppos, size_t len, unsigned int flags) 753 871 { 754 - if (out->f_op->splice_write) 755 - return out->f_op->splice_write(pipe, out, ppos, len, flags); 756 - return default_file_splice_write(pipe, out, ppos, len, flags); 872 + if (unlikely(!out->f_op->splice_write)) 873 + return warn_unsupported(out, "write"); 874 + return out->f_op->splice_write(pipe, out, ppos, len, flags); 757 875 } 758 876 759 877 /* ··· 783 885 if (unlikely(len > MAX_RW_COUNT)) 784 886 len = MAX_RW_COUNT; 785 887 786 - if (in->f_op->splice_read) 787 - return in->f_op->splice_read(in, ppos, pipe, len, flags); 788 - return default_file_splice_read(in, ppos, pipe, len, flags); 888 + if (unlikely(!in->f_op->splice_read)) 889 + return warn_unsupported(in, "read"); 890 + return in->f_op->splice_read(in, ppos, pipe, len, flags); 789 891 } 790 892 791 893 /**
+1 -1
include/linux/bpf-cgroup.h
··· 136 136 137 137 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, 138 138 struct ctl_table *table, int write, 139 - void **buf, size_t *pcount, loff_t *ppos, 139 + char **buf, size_t *pcount, loff_t *ppos, 140 140 enum bpf_attach_type type); 141 141 142 142 int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
-2
include/linux/fs.h
··· 1894 1894 1895 1895 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); 1896 1896 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); 1897 - extern ssize_t vfs_readv(struct file *, const struct iovec __user *, 1898 - unsigned long, loff_t *, rwf_t); 1899 1897 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, 1900 1898 loff_t, size_t, unsigned int); 1901 1899 extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
+1
include/linux/proc_fs.h
··· 30 30 unsigned int proc_flags; 31 31 int (*proc_open)(struct inode *, struct file *); 32 32 ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); 33 + ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *); 33 34 ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); 34 35 loff_t (*proc_lseek)(struct file *, loff_t, int); 35 36 int (*proc_release)(struct inode *, struct file *);
+18
include/linux/uaccess.h
··· 10 10 11 11 #include <asm/uaccess.h> 12 12 13 + #ifdef CONFIG_SET_FS 13 14 /* 14 15 * Force the uaccess routines to be wired up for actual userspace access, 15 16 * overriding any possible set_fs(KERNEL_DS) still lingering around. Undone ··· 28 27 { 29 28 set_fs(oldfs); 30 29 } 30 + #else /* CONFIG_SET_FS */ 31 + typedef struct { 32 + /* empty dummy */ 33 + } mm_segment_t; 34 + 35 + #define uaccess_kernel() (false) 36 + #define user_addr_max() (TASK_SIZE_MAX) 37 + 38 + static inline mm_segment_t force_uaccess_begin(void) 39 + { 40 + return (mm_segment_t) { }; 41 + } 42 + 43 + static inline void force_uaccess_end(mm_segment_t oldfs) 44 + { 45 + } 46 + #endif /* CONFIG_SET_FS */ 31 47 32 48 /* 33 49 * Architectures should provide two primitives (raw_copy_{to,from}_user())
+1 -1
kernel/bpf/cgroup.c
··· 1226 1226 */ 1227 1227 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, 1228 1228 struct ctl_table *table, int write, 1229 - void **buf, size_t *pcount, loff_t *ppos, 1229 + char **buf, size_t *pcount, loff_t *ppos, 1230 1230 enum bpf_attach_type type) 1231 1231 { 1232 1232 struct bpf_sysctl_kern ctx = {
+21 -70
lib/test_bitmap.c
··· 354 354 355 355 }; 356 356 357 - static void __init __test_bitmap_parselist(int is_user) 357 + static void __init test_bitmap_parselist(void) 358 358 { 359 359 int i; 360 360 int err; 361 361 ktime_t time; 362 362 DECLARE_BITMAP(bmap, 2048); 363 - char *mode = is_user ? "_user" : ""; 364 363 365 364 for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) { 366 365 #define ptest parselist_tests[i] 367 366 368 - if (is_user) { 369 - mm_segment_t orig_fs = get_fs(); 370 - size_t len = strlen(ptest.in); 371 - 372 - set_fs(KERNEL_DS); 373 - time = ktime_get(); 374 - err = bitmap_parselist_user((__force const char __user *)ptest.in, len, 375 - bmap, ptest.nbits); 376 - time = ktime_get() - time; 377 - set_fs(orig_fs); 378 - } else { 379 - time = ktime_get(); 380 - err = bitmap_parselist(ptest.in, bmap, ptest.nbits); 381 - time = ktime_get() - time; 382 - } 367 + time = ktime_get(); 368 + err = bitmap_parselist(ptest.in, bmap, ptest.nbits); 369 + time = ktime_get() - time; 383 370 384 371 if (err != ptest.errno) { 385 - pr_err("parselist%s: %d: input is %s, errno is %d, expected %d\n", 386 - mode, i, ptest.in, err, ptest.errno); 372 + pr_err("parselist: %d: input is %s, errno is %d, expected %d\n", 373 + i, ptest.in, err, ptest.errno); 387 374 continue; 388 375 } 389 376 390 377 if (!err && ptest.expected 391 378 && !__bitmap_equal(bmap, ptest.expected, ptest.nbits)) { 392 - pr_err("parselist%s: %d: input is %s, result is 0x%lx, expected 0x%lx\n", 393 - mode, i, ptest.in, bmap[0], 379 + pr_err("parselist: %d: input is %s, result is 0x%lx, expected 0x%lx\n", 380 + i, ptest.in, bmap[0], 394 381 *ptest.expected); 395 382 continue; 396 383 } 397 384 398 385 if (ptest.flags & PARSE_TIME) 399 - pr_err("parselist%s: %d: input is '%s' OK, Time: %llu\n", 400 - mode, i, ptest.in, time); 386 + pr_err("parselist: %d: input is '%s' OK, Time: %llu\n", 387 + i, ptest.in, time); 401 388 402 389 #undef ptest 403 390 } ··· 430 443 #undef step 431 444 }; 432 445 433 - static void __init __test_bitmap_parse(int is_user) 446 + static void __init test_bitmap_parse(void) 434 447 { 435 448 int i; 436 449 int err; 437 450 ktime_t time; 438 451 DECLARE_BITMAP(bmap, 2048); 439 - char *mode = is_user ? "_user" : ""; 440 452 441 453 for (i = 0; i < ARRAY_SIZE(parse_tests); i++) { 442 454 struct test_bitmap_parselist test = parse_tests[i]; 455 + size_t len = test.flags & NO_LEN ? UINT_MAX : strlen(test.in); 443 456 444 - if (is_user) { 445 - size_t len = strlen(test.in); 446 - mm_segment_t orig_fs = get_fs(); 447 - 448 - set_fs(KERNEL_DS); 449 - time = ktime_get(); 450 - err = bitmap_parse_user((__force const char __user *)test.in, len, 451 - bmap, test.nbits); 452 - time = ktime_get() - time; 453 - set_fs(orig_fs); 454 - } else { 455 - size_t len = test.flags & NO_LEN ? 456 - UINT_MAX : strlen(test.in); 457 - time = ktime_get(); 458 - err = bitmap_parse(test.in, len, bmap, test.nbits); 459 - time = ktime_get() - time; 460 - } 457 + time = ktime_get(); 458 + err = bitmap_parse(test.in, len, bmap, test.nbits); 459 + time = ktime_get() - time; 461 460 462 461 if (err != test.errno) { 463 - pr_err("parse%s: %d: input is %s, errno is %d, expected %d\n", 464 - mode, i, test.in, err, test.errno); 462 + pr_err("parse: %d: input is %s, errno is %d, expected %d\n", 463 + i, test.in, err, test.errno); 465 464 continue; 466 465 } 467 466 468 467 if (!err && test.expected 469 468 && !__bitmap_equal(bmap, test.expected, test.nbits)) { 470 - pr_err("parse%s: %d: input is %s, result is 0x%lx, expected 0x%lx\n", 471 - mode, i, test.in, bmap[0], 469 + pr_err("parse: %d: input is %s, result is 0x%lx, expected 0x%lx\n", 470 + i, test.in, bmap[0], 472 471 *test.expected); 473 472 continue; 474 473 } 475 474 476 475 if (test.flags & PARSE_TIME) 477 - pr_err("parse%s: %d: input is '%s' OK, Time: %llu\n", 478 - mode, i, test.in, time); 476 + pr_err("parse: %d: input is '%s' OK, Time: %llu\n", 477 + i, test.in, time); 479 478 } 480 - } 481 - 482 - static void __init test_bitmap_parselist(void) 483 - { 484 - __test_bitmap_parselist(0); 485 - } 486 - 487 - static void __init test_bitmap_parselist_user(void) 488 - { 489 - __test_bitmap_parselist(1); 490 - } 491 - 492 - static void __init test_bitmap_parse(void) 493 - { 494 - __test_bitmap_parse(0); 495 - } 496 - 497 - static void __init test_bitmap_parse_user(void) 498 - { 499 - __test_bitmap_parse(1); 500 479 } 501 480 502 481 #define EXP1_IN_BITS (sizeof(exp1) * 8) ··· 628 675 test_replace(); 629 676 test_bitmap_arr32(); 630 677 test_bitmap_parse(); 631 - test_bitmap_parse_user(); 632 678 test_bitmap_parselist(); 633 - test_bitmap_parselist_user(); 634 679 test_mem_optimisations(); 635 680 test_for_each_set_clump8(); 636 681 test_bitmap_cut();
-2
tools/testing/selftests/lkdtm/tests.txt
··· 9 9 #CORRUPT_STACK_STRONG Crashes entire system on success 10 10 CORRUPT_LIST_ADD list_add corruption 11 11 CORRUPT_LIST_DEL list_del corruption 12 - CORRUPT_USER_DS Invalid address limit on user-mode return 13 12 STACK_GUARD_PAGE_LEADING 14 13 STACK_GUARD_PAGE_TRAILING 15 14 UNSET_SMEP CR4 bits went missing ··· 66 67 USERCOPY_STACK_FRAME_FROM 67 68 USERCOPY_STACK_BEYOND 68 69 USERCOPY_KERNEL 69 - USERCOPY_KERNEL_DS 70 70 STACKLEAK_ERASING OK: the rest of the thread stack is properly erased 71 71 CFI_FORWARD_PROTO