Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 protection key support from Ingo Molnar:
"This tree adds support for a new memory protection hardware feature
that is available in upcoming Intel CPUs: 'protection keys' (pkeys).

There's a background article at LWN.net:

https://lwn.net/Articles/643797/

The gist is that protection keys allow the encoding of
user-controllable permission masks in the pte. So instead of having a
fixed protection mask in the pte (which needs a system call to change
and works on a per-page basis), the user can map a (handful of)
protection mask variants and can change the masks at runtime
relatively cheaply, without having to change every single page in the
affected virtual memory range.

This allows the dynamic switching of the protection bits of large
amounts of virtual memory, via user-space instructions. It also
allows more precise control of MMU permission bits: for example the
executable bit is separate from the read bit (see more about that
below).

This tree adds the MM infrastructure and low level x86 glue needed for
that, plus it adds a high level API to make use of protection keys -
if a user-space application calls:

mmap(..., PROT_EXEC);

or

mprotect(ptr, sz, PROT_EXEC);

(note PROT_EXEC-only, without PROT_READ/WRITE), the kernel will notice
this special case, and will set a special protection key on this
memory range. It also sets the appropriate bits in the Protection
Keys User Rights (PKRU) register so that the memory becomes unreadable
and unwritable.

So using protection keys the kernel is able to implement 'true'
PROT_EXEC on x86 CPUs: without protection keys PROT_EXEC implies
PROT_READ as well. Unreadable executable mappings have security
advantages: they cannot be read via information leaks to figure out
ASLR details, nor can they be scanned for ROP gadgets - and they
cannot be used by exploits for data purposes either.

We know of no user-space code that relies on pure PROT_EXEC
mappings today, but binary loaders could start making use of this new
feature to map binaries and libraries in a more secure fashion.

There is other pending pkeys work that offers more high level system
call APIs to manage protection keys - but those are not part of this
pull request.

Right now there's a Kconfig option that controls this feature
(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS); it is enabled by default
(like most x86 CPU feature enablement code that has no runtime
overhead), but it's not user-configurable at the moment. If there's
any serious problem with this then we can make it configurable and/or
flip the default"

* 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (38 commits)
x86/mm/pkeys: Fix mismerge of protection keys CPUID bits
mm/pkeys: Fix siginfo ABI breakage caused by new u64 field
x86/mm/pkeys: Fix access_error() denial of writes to write-only VMA
mm/core, x86/mm/pkeys: Add execute-only protection keys support
x86/mm/pkeys: Create an x86 arch_calc_vm_prot_bits() for VMA flags
x86/mm/pkeys: Allow kernel to modify user pkey rights register
x86/fpu: Allow setting of XSAVE state
x86/mm: Factor out LDT init from context init
mm/core, x86/mm/pkeys: Add arch_validate_pkey()
mm/core, arch, powerpc: Pass a protection key in to calc_vm_flag_bits()
x86/mm/pkeys: Actually enable Memory Protection Keys in the CPU
x86/mm/pkeys: Add Kconfig prompt to existing config option
x86/mm/pkeys: Dump pkey from VMA in /proc/pid/smaps
x86/mm/pkeys: Dump PKRU with other kernel registers
mm/core, x86/mm/pkeys: Differentiate instruction fetches
x86/mm/pkeys: Optimize fault handling in access_error()
mm/core: Do not enforce PKEY permissions on remote mm access
um, pkeys: Add UML arch_*_access_permitted() methods
mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys
x86/mm/gup: Simplify get_user_pages() PTE bit handling
...

+1406 -241
+3
Documentation/kernel-parameters.txt
···
      See Documentation/x86/intel_mpx.txt for more
      information about the feature.

+     nopku       [X86] Disable Memory Protection Keys CPU feature found
+                 in some Intel CPUs.
+
      eagerfpu=   [X86]
                  on      enable eager fpu restore
                  off     disable eager fpu restore
+2 -6
arch/cris/arch-v32/drivers/cryptocop.c
···
      /* Acquire the mm page semaphore. */
      down_read(&current->mm->mmap_sem);

-     err = get_user_pages(current,
-                          current->mm,
-                          (unsigned long int)(oper.indata + prev_ix),
+     err = get_user_pages((unsigned long int)(oper.indata + prev_ix),
                           noinpages,
                           0,  /* read access only for in data */
                           0,  /* no force */
···
      }
      noinpages = err;
      if (oper.do_cipher){
-         err = get_user_pages(current,
-                              current->mm,
-                              (unsigned long int)oper.cipher_outdata,
+         err = get_user_pages((unsigned long int)oper.cipher_outdata,
                               nooutpages,
                               1, /* write access for out data */
                               0, /* no force */
+9 -4
arch/ia64/include/uapi/asm/siginfo.h
···
          unsigned int _flags;    /* see below */
          unsigned long _isr;     /* isr */
          short _addr_lsb;        /* lsb of faulting address */
-         struct {
-             void __user *_lower;
-             void __user *_upper;
-         } _addr_bnd;
+         union {
+             /* used when si_code=SEGV_BNDERR */
+             struct {
+                 void __user *_lower;
+                 void __user *_upper;
+             } _addr_bnd;
+             /* used when si_code=SEGV_PKUERR */
+             __u32 _pkey;
+         };
      } _sigfault;

      /* SIGPOLL */
+1 -2
arch/ia64/kernel/err_inject.c
···
      u64 virt_addr=simple_strtoull(buf, NULL, 16);
      int ret;

-     ret = get_user_pages(current, current->mm, virt_addr,
-                          1, VM_READ, 0, NULL, NULL);
+     ret = get_user_pages(virt_addr, 1, VM_READ, 0, NULL, NULL);
      if (ret<=0) {
  #ifdef ERR_INJ_DEBUG
          printk("Virtual address %lx is not existing.\n",virt_addr);
+9 -4
arch/mips/include/uapi/asm/siginfo.h
···
          int _trapno;    /* TRAP # which caused the signal */
  #endif
          short _addr_lsb;
-         struct {
-             void __user *_lower;
-             void __user *_upper;
-         } _addr_bnd;
+         union {
+             /* used when si_code=SEGV_BNDERR */
+             struct {
+                 void __user *_lower;
+                 void __user *_upper;
+             } _addr_bnd;
+             /* used when si_code=SEGV_PKUERR */
+             __u32 _pkey;
+         };
      } _sigfault;

      /* SIGPOLL, SIGXFSZ (To do ...) */
+1 -2
arch/mips/mm/gup.c
···
      start += nr << PAGE_SHIFT;
      pages += nr;

-     ret = get_user_pages_unlocked(current, mm, start,
-                                   (end - start) >> PAGE_SHIFT,
+     ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
                                    write, 0, pages);

      /* Have to be a bit careful with return values */
+3 -2
arch/powerpc/include/asm/mman.h
···
   * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
   * here. How important is the optimization?
   */
- static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot)
+ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+                                                    unsigned long pkey)
  {
      return (prot & PROT_SAO) ? VM_SAO : 0;
  }
- #define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot)
+ #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)

  static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
  {
+12
arch/powerpc/include/asm/mmu_context.h
···
  {
  }

+ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+         bool write, bool execute, bool foreign)
+ {
+     /* by default, allow everything */
+     return true;
+ }
+
+ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+ {
+     /* by default, allow everything */
+     return true;
+ }
  #endif /* __KERNEL__ */
  #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
+12
arch/s390/include/asm/mmu_context.h
···
  {
  }

+ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+         bool write, bool execute, bool foreign)
+ {
+     /* by default, allow everything */
+     return true;
+ }
+
+ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+ {
+     /* by default, allow everything */
+     return true;
+ }
  #endif /* __S390_MMU_CONTEXT_H */
+1 -3
arch/s390/mm/gup.c
···
  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                          struct page **pages)
  {
-     struct mm_struct *mm = current->mm;
      int nr, ret;

      might_sleep();
···
      /* Try to get the remaining pages with get_user_pages */
      start += nr << PAGE_SHIFT;
      pages += nr;
-     ret = get_user_pages_unlocked(current, mm, start,
-                                   nr_pages - nr, write, 0, pages);
+     ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages);
      /* Have to be a bit careful with return values */
      if (nr > 0)
          ret = (ret < 0) ? nr : ret + nr;
+1 -1
arch/sh/mm/gup.c
···
      start += nr << PAGE_SHIFT;
      pages += nr;

-     ret = get_user_pages_unlocked(current, mm, start,
+     ret = get_user_pages_unlocked(start,
                  (end - start) >> PAGE_SHIFT, write, 0, pages);

      /* Have to be a bit careful with return values */
+1 -1
arch/sparc/mm/gup.c
···
      start += nr << PAGE_SHIFT;
      pages += nr;

-     ret = get_user_pages_unlocked(current, mm, start,
+     ret = get_user_pages_unlocked(start,
                  (end - start) >> PAGE_SHIFT, write, 0, pages);

      /* Have to be a bit careful with return values */
+14
arch/um/include/asm/mmu_context.h
···
                                 struct vm_area_struct *vma)
  {
  }
+
+ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+         bool write, bool execute, bool foreign)
+ {
+     /* by default, allow everything */
+     return true;
+ }
+
+ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+ {
+     /* by default, allow everything */
+     return true;
+ }
+
  /*
   * end asm-generic/mm_hooks.h functions
   */
+12
arch/unicore32/include/asm/mmu_context.h
···
  {
  }

+ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+         bool write, bool foreign)
+ {
+     /* by default, allow everything */
+     return true;
+ }
+
+ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+ {
+     /* by default, allow everything */
+     return true;
+ }
  #endif
+16
arch/x86/Kconfig
···
      select X86_DEV_DMA_OPS              if X86_64
      select X86_FEATURE_NAMES            if PROC_FS
      select HAVE_STACK_VALIDATION        if X86_64
+     select ARCH_USES_HIGH_VMA_FLAGS     if X86_INTEL_MEMORY_PROTECTION_KEYS
+     select ARCH_HAS_PKEYS               if X86_INTEL_MEMORY_PROTECTION_KEYS

  config INSTRUCTION_DECODER
      def_bool y
···
        For details, see Documentation/x86/intel_mpx.txt

        If unsure, say N.
+
+ config X86_INTEL_MEMORY_PROTECTION_KEYS
+     prompt "Intel Memory Protection Keys"
+     def_bool y
+     # Note: only available in 64-bit mode
+     depends on CPU_SUP_INTEL && X86_64
+     ---help---
+       Memory Protection Keys provides a mechanism for enforcing
+       page-based protections, but without requiring modification of the
+       page tables when an application changes protection domains.
+
+       For details, see Documentation/x86/protection-keys.txt
+
+       If unsure, say y.

  config EFI
      bool "EFI runtime service support"
+35 -20
arch/x86/include/asm/cpufeature.h
···
      CPUID_8000_0008_EBX,
      CPUID_6_EAX,
      CPUID_8000_000A_EDX,
+     CPUID_7_ECX,
  };

  #ifdef CONFIG_X86_FEATURE_NAMES
···
      test_bit(bit, (unsigned long *)((c)->x86_capability))

  #define REQUIRED_MASK_BIT_SET(bit)                                  \
- ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) ||          \
-   (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) ||          \
-   (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) ||          \
-   (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) ||          \
-   (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) ||          \
-   (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) ||          \
-   (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) ||          \
-   (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ||          \
-   (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) ||          \
-   (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+ ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||        \
+   (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||        \
+   (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||        \
+   (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||        \
+   (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||        \
+   (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||        \
+   (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||        \
+   (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||        \
+   (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||        \
+   (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||        \
+   (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||        \
+   (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||        \
+   (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||        \
+   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||        \
+   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||        \
+   (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||        \
+   (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )

  #define DISABLED_MASK_BIT_SET(bit)                                  \
- ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0)) ||          \
-   (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1)) ||          \
-   (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2)) ||          \
-   (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3)) ||          \
-   (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4)) ||          \
-   (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5)) ||          \
-   (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6)) ||          \
-   (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7)) ||          \
-   (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8)) ||          \
-   (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9)) )
+ ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||        \
+   (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||        \
+   (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||        \
+   (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||        \
+   (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||        \
+   (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||        \
+   (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||        \
+   (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||        \
+   (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||        \
+   (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||        \
+   (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||        \
+   (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||        \
+   (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||        \
+   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||        \
+   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||        \
+   (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||        \
+   (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )

  #define cpu_has(c, bit)                                             \
      (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
+5 -1
arch/x86/include/asm/cpufeatures.h
···
  /*
   * Defines x86 CPU feature bits
   */
- #define NCAPINTS    16    /* N 32-bit words worth of info */
+ #define NCAPINTS    17    /* N 32-bit words worth of info */
  #define NBUGINTS    1     /* N 32-bit bug flags */

  /*
···
  #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
  #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
  #define X86_FEATURE_AVIC        (15*32+13) /* Virtual Interrupt Controller */
+
+ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
+ #define X86_FEATURE_PKU         (16*32+ 3) /* Protection Keys for Userspace */
+ #define X86_FEATURE_OSPKE       (16*32+ 4) /* OS Protection Keys Enable */

  /*
   * BUG word(s)
+15
arch/x86/include/asm/disabled-features.h
···
  # define DISABLE_CENTAUR_MCR 0
  #endif /* CONFIG_X86_64 */

+ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ # define DISABLE_PKU    (1<<(X86_FEATURE_PKU))
+ # define DISABLE_OSPKE  (1<<(X86_FEATURE_OSPKE))
+ #else
+ # define DISABLE_PKU    0
+ # define DISABLE_OSPKE  0
+ #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
+
  /*
   * Make sure to add features to the correct mask
   */
···
  #define DISABLED_MASK7  0
  #define DISABLED_MASK8  0
  #define DISABLED_MASK9  (DISABLE_MPX)
+ #define DISABLED_MASK10 0
+ #define DISABLED_MASK11 0
+ #define DISABLED_MASK12 0
+ #define DISABLED_MASK13 0
+ #define DISABLED_MASK14 0
+ #define DISABLED_MASK15 0
+ #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)

  #endif /* _ASM_X86_DISABLED_FEATURES_H */
+2
arch/x86/include/asm/fpu/internal.h
···
  extern void fpu__activate_curr(struct fpu *fpu);
  extern void fpu__activate_fpstate_read(struct fpu *fpu);
  extern void fpu__activate_fpstate_write(struct fpu *fpu);
+ extern void fpu__current_fpstate_write_begin(void);
+ extern void fpu__current_fpstate_write_end(void);
  extern void fpu__save(struct fpu *fpu);
  extern void fpu__restore(struct fpu *fpu);
  extern int fpu__restore_sig(void __user *buf, int ia32_frame);
+12
arch/x86/include/asm/fpu/types.h
···
      XFEATURE_OPMASK,
      XFEATURE_ZMM_Hi256,
      XFEATURE_Hi16_ZMM,
+     XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
+     XFEATURE_PKRU,

      XFEATURE_MAX,
  };
···
  #define XFEATURE_MASK_OPMASK     (1 << XFEATURE_OPMASK)
  #define XFEATURE_MASK_ZMM_Hi256  (1 << XFEATURE_ZMM_Hi256)
  #define XFEATURE_MASK_Hi16_ZMM   (1 << XFEATURE_Hi16_ZMM)
+ #define XFEATURE_MASK_PKRU       (1 << XFEATURE_PKRU)

  #define XFEATURE_MASK_FPSSE      (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
  #define XFEATURE_MASK_AVX512     (XFEATURE_MASK_OPMASK \
···
   */
  struct avx_512_hi16_state {
      struct reg_512_bit hi16_zmm[16];
  } __packed;
+
+ /*
+  * State component 9: 32-bit PKRU register.  The state is
+  * 8 bytes long but only 4 bytes is used currently.
+  */
+ struct pkru_state {
+     u32 pkru;
+     u32 pad;
+ } __packed;

  struct xstate_header {
+2 -1
arch/x86/include/asm/fpu/xstate.h
···
                                XFEATURE_MASK_YMM | \
                                XFEATURE_MASK_OPMASK | \
                                XFEATURE_MASK_ZMM_Hi256 | \
-                               XFEATURE_MASK_Hi16_ZMM)
+                               XFEATURE_MASK_Hi16_ZMM | \
+                               XFEATURE_MASK_PKRU)

  /* Supported features which require eager state saving */
  #define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
+80 -5
arch/x86/include/asm/mmu_context.h
···
  /*
   * Used for LDT copy/destruction.
   */
- int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
- void destroy_context(struct mm_struct *mm);
+ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+ void destroy_context_ldt(struct mm_struct *mm);
  #else /* CONFIG_MODIFY_LDT_SYSCALL */
- static inline int init_new_context(struct task_struct *tsk,
-                                    struct mm_struct *mm)
+ static inline int init_new_context_ldt(struct task_struct *tsk,
+                                        struct mm_struct *mm)
  {
      return 0;
  }
- static inline void destroy_context(struct mm_struct *mm) {}
+ static inline void destroy_context_ldt(struct mm_struct *mm) {}
  #endif

  static inline void load_mm_ldt(struct mm_struct *mm)
···
      if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
          this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
  #endif
+ }
+
+ static inline int init_new_context(struct task_struct *tsk,
+                                    struct mm_struct *mm)
+ {
+     init_new_context_ldt(tsk, mm);
+     return 0;
+ }
+ static inline void destroy_context(struct mm_struct *mm)
+ {
+     destroy_context_ldt(mm);
  }

  static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
···
   */
      if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
          mpx_notify_unmap(mm, vma, start, end);
+ }
+
+ static inline int vma_pkey(struct vm_area_struct *vma)
+ {
+     u16 pkey = 0;
+ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+     unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
+                                   VM_PKEY_BIT2 | VM_PKEY_BIT3;
+     pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+ #endif
+     return pkey;
+ }
+
+ static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+ {
+     u32 pkru = read_pkru();
+
+     if (!__pkru_allows_read(pkru, pkey))
+         return false;
+     if (write && !__pkru_allows_write(pkru, pkey))
+         return false;
+
+     return true;
+ }
+
+ /*
+  * We only want to enforce protection keys on the current process
+  * because we effectively have no access to PKRU for other
+  * processes or any way to tell *which* PKRU in a threaded
+  * process we could use.
+  *
+  * So do not enforce things if the VMA is not from the current
+  * mm, or if we are in a kernel thread.
+  */
+ static inline bool vma_is_foreign(struct vm_area_struct *vma)
+ {
+     if (!current->mm)
+         return true;
+     /*
+      * Should PKRU be enforced on the access to this VMA?  If
+      * the VMA is from another process, then PKRU has no
+      * relevance and should not be enforced.
+      */
+     if (current->mm != vma->vm_mm)
+         return true;
+
+     return false;
+ }
+
+ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+         bool write, bool execute, bool foreign)
+ {
+     /* pkeys never affect instruction fetches */
+     if (execute)
+         return true;
+     /* allow access if the VMA is not one from this process */
+     if (foreign || vma_is_foreign(vma))
+         return true;
+     return __pkru_allows_pkey(vma_pkey(vma), write);
+ }
+
+ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+ {
+     return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
  }

  #endif /* _ASM_X86_MMU_CONTEXT_H */
+38
arch/x86/include/asm/pgtable.h
···
      return pte_flags(pte) & _PAGE_DIRTY;
  }

+
+ static inline u32 read_pkru(void)
+ {
+     if (boot_cpu_has(X86_FEATURE_OSPKE))
+         return __read_pkru();
+     return 0;
+ }
+
  static inline int pte_young(pte_t pte)
  {
      return pte_flags(pte) & _PAGE_ACCESSED;
···
      return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
  }
  #endif
+
+ #define PKRU_AD_BIT 0x1
+ #define PKRU_WD_BIT 0x2
+ #define PKRU_BITS_PER_PKEY 2
+
+ static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
+ {
+     int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+     return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
+ }
+
+ static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
+ {
+     int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+     /*
+      * Access-disable disables writes too so we need to check
+      * both bits here.
+      */
+     return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
+ }
+
+ static inline u16 pte_flags_pkey(unsigned long pte_flags)
+ {
+ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+     /* ifdef to avoid doing 59-bit shift on 32-bit values */
+     return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
+ #else
+     return 0;
+ #endif
+ }

  #include <asm-generic/pgtable.h>
  #endif /* __ASSEMBLY__ */
+34 -5
arch/x86/include/asm/pgtable_types.h
···
  #define _PAGE_BIT_SOFTW2      10   /* " */
  #define _PAGE_BIT_SOFTW3      11   /* " */
  #define _PAGE_BIT_PAT_LARGE   12   /* On 2MB or 1GB pages */
+ #define _PAGE_BIT_SOFTW4      58   /* available for programmer */
+ #define _PAGE_BIT_PKEY_BIT0   59   /* Protection Keys, bit 1/4 */
+ #define _PAGE_BIT_PKEY_BIT1   60   /* Protection Keys, bit 2/4 */
+ #define _PAGE_BIT_PKEY_BIT2   61   /* Protection Keys, bit 3/4 */
+ #define _PAGE_BIT_PKEY_BIT3   62   /* Protection Keys, bit 4/4 */
+ #define _PAGE_BIT_NX          63   /* No execute: only valid after cpuid check */
+
  #define _PAGE_BIT_SPECIAL     _PAGE_BIT_SOFTW1
  #define _PAGE_BIT_CPA_TEST    _PAGE_BIT_SOFTW1
  #define _PAGE_BIT_HIDDEN      _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
  #define _PAGE_BIT_SOFT_DIRTY  _PAGE_BIT_SOFTW3 /* software dirty tracking */
- #define _PAGE_BIT_SOFTW4      58   /* available for programmer */
- #define _PAGE_BIT_DEVMAP      _PAGE_BIT_SOFTW4
- #define _PAGE_BIT_NX          63   /* No execute: only valid after cpuid check */
+ #define _PAGE_BIT_DEVMAP      _PAGE_BIT_SOFTW4

  /* If _PAGE_BIT_PRESENT is clear, we use these: */
  /* - if the user mapped it with PROT_NONE; pte_present gives true */
···
  #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
  #define _PAGE_SPECIAL   (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
  #define _PAGE_CPA_TEST  (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
+ #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
+ #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
+ #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
+ #else
+ #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0))
+ #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0))
+ #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0))
+ #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0))
+ #endif
  #define __HAVE_ARCH_PTE_SPECIAL
+
+ #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
+                          _PAGE_PKEY_BIT1 | \
+                          _PAGE_PKEY_BIT2 | \
+                          _PAGE_PKEY_BIT3)

  #ifdef CONFIG_KMEMCHECK
  #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
···
  #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
                         _PAGE_DIRTY)

- /* Set of bits not changed in pte_modify */
+ /*
+  * Set of bits not changed in pte_modify.  The pte's
+  * protection key is treated like _PAGE_RW, for
+  * instance, and is *not* included in this mask since
+  * pte_modify() does modify it.
+  */
  #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
                          _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
                          _PAGE_SOFT_DIRTY)
···
  /* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
  #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)

- /* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */
+ /*
+  * Extracts the flags from a (pte|pmd|pud|pgd)val_t
+  * This includes the protection key value.
+  */
  #define PTE_FLAGS_MASK (~PTE_PFN_MASK)

  typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
+34
arch/x86/include/asm/pkeys.h
···
+ #ifndef _ASM_X86_PKEYS_H
+ #define _ASM_X86_PKEYS_H
+
+ #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
+
+ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+                                      unsigned long init_val);
+
+ /*
+  * Try to dedicate one of the protection keys to be used as an
+  * execute-only protection key.
+  */
+ #define PKEY_DEDICATED_EXECUTE_ONLY 15
+ extern int __execute_only_pkey(struct mm_struct *mm);
+ static inline int execute_only_pkey(struct mm_struct *mm)
+ {
+     if (!boot_cpu_has(X86_FEATURE_OSPKE))
+         return 0;
+
+     return __execute_only_pkey(mm);
+ }
+
+ extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
+                                          int prot, int pkey);
+ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
+                                               int prot, int pkey)
+ {
+     if (!boot_cpu_has(X86_FEATURE_OSPKE))
+         return 0;
+
+     return __arch_override_mprotect_pkey(vma, prot, pkey);
+ }
+
+ #endif /*_ASM_X86_PKEYS_H */
+7
arch/x86/include/asm/required-features.h
···
  #define REQUIRED_MASK7  0
  #define REQUIRED_MASK8  0
  #define REQUIRED_MASK9  0
+ #define REQUIRED_MASK10 0
+ #define REQUIRED_MASK11 0
+ #define REQUIRED_MASK12 0
+ #define REQUIRED_MASK13 0
+ #define REQUIRED_MASK14 0
+ #define REQUIRED_MASK15 0
+ #define REQUIRED_MASK16 0

  #endif /* _ASM_X86_REQUIRED_FEATURES_H */
+22
arch/x86/include/asm/special_insns.h
···
  }
  #endif

+ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ static inline u32 __read_pkru(void)
+ {
+     u32 ecx = 0;
+     u32 edx, pkru;
+
+     /*
+      * "rdpkru" instruction.  Places PKRU contents in to EAX,
+      * clears EDX and requires that ecx=0.
+      */
+     asm volatile(".byte 0x0f,0x01,0xee\n\t"
+                  : "=a" (pkru), "=d" (edx)
+                  : "c" (ecx));
+     return pkru;
+ }
+ #else
+ static inline u32 __read_pkru(void)
+ {
+     return 0;
+ }
+ #endif
+
  static inline void native_wbinvd(void)
  {
      asm volatile("wbinvd": : :"memory");
+22
arch/x86/include/uapi/asm/mman.h
···
  #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
  #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)

+ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /*
+  * Take the 4 protection key bits out of the vma->vm_flags
+  * value and turn them in to the bits that we can put in
+  * to a pte.
+  *
+  * Only override these if Protection Keys are available
+  * (which is only on 64-bit).
+  */
+ #define arch_vm_get_page_prot(vm_flags) __pgprot(           \
+         ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
+         ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
+         ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
+         ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+ #define arch_calc_vm_prot_bits(prot, key) (    \
+         ((key) & 0x1 ? VM_PKEY_BIT0 : 0) |     \
+         ((key) & 0x2 ? VM_PKEY_BIT1 : 0) |     \
+         ((key) & 0x4 ? VM_PKEY_BIT2 : 0) |     \
+         ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+ #endif

  #include <asm-generic/mman.h>

  #endif /* _ASM_X86_MMAN_H */
+2
arch/x86/include/uapi/asm/processor-flags.h
···
  #define X86_CR4_SMEP      _BITUL(X86_CR4_SMEP_BIT)
  #define X86_CR4_SMAP_BIT  21 /* enable SMAP support */
  #define X86_CR4_SMAP      _BITUL(X86_CR4_SMAP_BIT)
+ #define X86_CR4_PKE_BIT   22 /* enable Protection Keys support */
+ #define X86_CR4_PKE       _BITUL(X86_CR4_PKE_BIT)

  /*
   * x86-64 Task Priority Register, CR8
+44
arch/x86/kernel/cpu/common.c
··· 304 304 } 305 305 306 306 /* 307 + * Protection Keys are not available in 32-bit mode. 308 + */ 309 + static bool pku_disabled; 310 + 311 + static __always_inline void setup_pku(struct cpuinfo_x86 *c) 312 + { 313 + if (!cpu_has(c, X86_FEATURE_PKU)) 314 + return; 315 + if (pku_disabled) 316 + return; 317 + 318 + cr4_set_bits(X86_CR4_PKE); 319 + /* 320 + * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE 321 + * cpuid bit to be set. We need to ensure that we 322 + * update that bit in this CPU's "cpu_info". 323 + */ 324 + get_cpu_cap(c); 325 + } 326 + 327 + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 328 + static __init int setup_disable_pku(char *arg) 329 + { 330 + /* 331 + * Do not clear the X86_FEATURE_PKU bit. All of the 332 + * runtime checks are against OSPKE so clearing the 333 + * bit does nothing. 334 + * 335 + * This way, we will see "pku" in cpuinfo, but not 336 + * "ospke", which is exactly what we want. It shows 337 + * that the CPU has PKU, but the OS has not enabled it. 338 + * This happens to be exactly how a system would look 339 + * if we disabled the config option. 340 + */ 341 + pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n"); 342 + pku_disabled = true; 343 + return 1; 344 + } 345 + __setup("nopku", setup_disable_pku); 346 + #endif /* CONFIG_X86_64 */ 347 + 348 + /* 307 349 * Some CPU features depend on higher CPUID levels, which may not always 308 350 * be available due to CPUID level capping or broken virtualization 309 351 * software. Add those features to this table to auto-disable them. 
··· 667 625 c->x86_capability[CPUID_7_0_EBX] = ebx; 668 626 669 627 c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); 628 + c->x86_capability[CPUID_7_ECX] = ecx; 670 629 } 671 630 672 631 /* Extended state features: level 0x0000000d */ ··· 1025 982 init_hypervisor(c); 1026 983 x86_init_rdrand(c); 1027 984 x86_init_cache_qos(c); 985 + setup_pku(c); 1028 986 1029 987 /* 1030 988 * Clear/Set all flags overridden by options, need do it
+63
arch/x86/kernel/fpu/core.c
··· 354 354 } 355 355 356 356 /* 357 + * This function must be called before we write the current 358 + * task's fpstate. 359 + * 360 + * This call gets the current FPU register state and moves 361 + * it in to the 'fpstate'. Preemption is disabled so that 362 + * no writes to the 'fpstate' can occur from context 363 + * switches. 364 + * 365 + * Must be followed by a fpu__current_fpstate_write_end(). 366 + */ 367 + void fpu__current_fpstate_write_begin(void) 368 + { 369 + struct fpu *fpu = &current->thread.fpu; 370 + 371 + /* 372 + * Ensure that the context-switching code does not write 373 + * over the fpstate while we are doing our update. 374 + */ 375 + preempt_disable(); 376 + 377 + /* 378 + * Move the fpregs in to the fpu's 'fpstate'. 379 + */ 380 + fpu__activate_fpstate_read(fpu); 381 + 382 + /* 383 + * The caller is about to write to 'fpu'. Ensure that no 384 + * CPU thinks that its fpregs match the fpstate. This 385 + * ensures we will not be lazy and skip an XRSTOR in the 386 + * future. 387 + */ 388 + fpu->last_cpu = -1; 389 + } 390 + 391 + /* 392 + * This function must be paired with fpu__current_fpstate_write_begin() 393 + * 394 + * This will ensure that the modified fpstate gets placed back in 395 + * the fpregs if necessary. 396 + * 397 + * Note: This function may be called whether or not an _actual_ 398 + * write to the fpstate occurred. 399 + */ 400 + void fpu__current_fpstate_write_end(void) 401 + { 402 + struct fpu *fpu = &current->thread.fpu; 403 + 404 + /* 405 + * 'fpu' now has an updated copy of the state, but the 406 + * registers may still be out of date. Update them with 407 + * an XRSTOR if they are active. 408 + */ 409 + if (fpregs_active()) 410 + copy_kernel_to_fpregs(&fpu->state); 411 + 412 + /* 413 + * Our update is done and the fpregs/fpstate are in sync 414 + * if necessary. Context switches can happen again. 
415 + */ 416 + preempt_enable(); 417 + } 418 + 419 + /* 357 420 * 'fpu__restore()' is called to copy FPU registers from 358 421 * the FPU fpstate to the live hw registers and to activate 359 422 * access to the hardware registers, so that FPU instructions
+181 -4
arch/x86/kernel/fpu/xstate.c
··· 5 5 */ 6 6 #include <linux/compat.h> 7 7 #include <linux/cpu.h> 8 + #include <linux/pkeys.h> 8 9 9 10 #include <asm/fpu/api.h> 10 11 #include <asm/fpu/internal.h> ··· 14 13 15 14 #include <asm/tlbflush.h> 16 15 16 + /* 17 + * Although we spell it out in here, the Processor Trace 18 + * xfeature is completely unused. We use other mechanisms 19 + * to save/restore PT state in Linux. 20 + */ 17 21 static const char *xfeature_names[] = 18 22 { 19 23 "x87 floating point registers" , ··· 29 23 "AVX-512 opmask" , 30 24 "AVX-512 Hi256" , 31 25 "AVX-512 ZMM_Hi256" , 26 + "Processor Trace (unused)" , 27 + "Protection Keys User registers", 32 28 "unknown xstate feature" , 33 29 }; 34 30 ··· 64 56 setup_clear_cpu_cap(X86_FEATURE_AVX512VL); 65 57 setup_clear_cpu_cap(X86_FEATURE_MPX); 66 58 setup_clear_cpu_cap(X86_FEATURE_XGETBV1); 59 + setup_clear_cpu_cap(X86_FEATURE_PKU); 67 60 } 68 61 69 62 /* ··· 243 234 const char *feature_name; 244 235 245 236 if (cpu_has_xfeatures(xstate_mask, &feature_name)) 246 - pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name); 237 + pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); 247 238 } 248 239 249 240 /* ··· 259 250 print_xstate_feature(XFEATURE_MASK_OPMASK); 260 251 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); 261 252 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); 253 + print_xstate_feature(XFEATURE_MASK_PKRU); 262 254 } 263 255 264 256 /* ··· 476 466 XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); 477 467 XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); 478 468 XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); 469 + XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); 479 470 480 471 /* 481 472 * Make *SURE* to add any feature numbers in below if ··· 484 473 * numbers. 
485 474 */ 486 475 if ((nr < XFEATURE_YMM) || 487 - (nr >= XFEATURE_MAX)) { 476 + (nr >= XFEATURE_MAX) || 477 + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { 488 478 WARN_ONCE(1, "no structure for xstate: %d\n", nr); 489 479 XSTATE_WARN_ON(1); 490 480 } ··· 683 671 } 684 672 685 673 /* 674 + * Given an xstate feature mask, calculate where in the xsave 675 + * buffer the state is. Callers should ensure that the buffer 676 + * is valid. 677 + * 678 + * Note: does not work for compacted buffers. 679 + */ 680 + void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) 681 + { 682 + int feature_nr = fls64(xstate_feature_mask) - 1; 683 + 684 + return (void *)xsave + xstate_comp_offsets[feature_nr]; 685 + } 686 + /* 686 687 * Given the xsave area and a state inside, this function returns the 687 688 * address of the state. 688 689 * ··· 715 690 */ 716 691 void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) 717 692 { 718 - int feature_nr = fls64(xstate_feature) - 1; 719 693 /* 720 694 * Do we even *have* xsave state? 721 695 */ ··· 742 718 if (!(xsave->header.xfeatures & xstate_feature)) 743 719 return NULL; 744 720 745 - return (void *)xsave + xstate_comp_offsets[feature_nr]; 721 + return __raw_xsave_addr(xsave, xstate_feature); 746 722 } 747 723 EXPORT_SYMBOL_GPL(get_xsave_addr); 748 724 ··· 776 752 fpu__save(fpu); 777 753 778 754 return get_xsave_addr(&fpu->state.xsave, xsave_state); 755 + } 756 + 757 + 758 + /* 759 + * Set xfeatures (aka XSTATE_BV) bit for a feature that we want 760 + * to take out of its "init state". This will ensure that an 761 + * XRSTOR actually restores the state. 762 + */ 763 + static void fpu__xfeature_set_non_init(struct xregs_state *xsave, 764 + int xstate_feature_mask) 765 + { 766 + xsave->header.xfeatures |= xstate_feature_mask; 767 + } 768 + 769 + /* 770 + * This function is safe to call whether the FPU is in use or not. 771 + * 772 + * Note that this only works on the current task. 
773 + * 774 + * Inputs: 775 + * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, 776 + * XFEATURE_MASK_SSE, etc...) 777 + * @xsave_state_ptr: a pointer to a copy of the state that you would 778 + * like written in to the current task's FPU xsave state. This pointer 779 + * must not be located in the current task's xsave area. 780 + * Output: 781 + * address of the state in the xsave area or NULL if the state 782 + * is not present or is in its 'init state'. 783 + */ 784 + static void fpu__xfeature_set_state(int xstate_feature_mask, 785 + void *xstate_feature_src, size_t len) 786 + { 787 + struct xregs_state *xsave = &current->thread.fpu.state.xsave; 788 + struct fpu *fpu = &current->thread.fpu; 789 + void *dst; 790 + 791 + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { 792 + WARN_ONCE(1, "%s() attempted with no xsave support", __func__); 793 + return; 794 + } 795 + 796 + /* 797 + * Tell the FPU code that we need the FPU state to be in 798 + * 'fpu' (not in the registers), and that we need it to 799 + * be stable while we write to it. 800 + */ 801 + fpu__current_fpstate_write_begin(); 802 + 803 + /* 804 + * This method *WILL* *NOT* work for compact-format 805 + * buffers. If the 'xstate_feature_mask' is unset in 806 + * xcomp_bv then we may need to move other feature state 807 + * "up" in the buffer. 808 + */ 809 + if (xsave->header.xcomp_bv & xstate_feature_mask) { 810 + WARN_ON_ONCE(1); 811 + goto out; 812 + } 813 + 814 + /* find the location in the xsave buffer of the desired state */ 815 + dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); 816 + 817 + /* 818 + * Make sure that the pointer being passed in did not 819 + * come from the xsave buffer itself. 
820 + */ 821 + WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); 822 + 823 + /* put the caller-provided data in the location */ 824 + memcpy(dst, xstate_feature_src, len); 825 + 826 + /* 827 + * Mark the xfeature so that the CPU knows there is state 828 + * in the buffer now. 829 + */ 830 + fpu__xfeature_set_non_init(xsave, xstate_feature_mask); 831 + out: 832 + /* 833 + * We are done writing to the 'fpu'. Reenable preemption 834 + * and (possibly) move the fpstate back in to the fpregs. 835 + */ 836 + fpu__current_fpstate_write_end(); 837 + } 838 + 839 + #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) 840 + #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) 841 + 842 + /* 843 + * This will go out and modify the XSAVE buffer so that PKRU is 844 + * set to a particular state for access to 'pkey'. 845 + * 846 + * PKRU state does affect kernel access to user memory. We do 847 + * not modify PKRU *itself* here, only the XSAVE state that will 848 + * be restored in to PKRU when we return back to userspace. 849 + */ 850 + int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 851 + unsigned long init_val) 852 + { 853 + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; 854 + struct pkru_state *old_pkru_state; 855 + struct pkru_state new_pkru_state; 856 + int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); 857 + u32 new_pkru_bits = 0; 858 + 859 + /* 860 + * This check implies XSAVE support. OSPKE only gets 861 + * set if we enable XSAVE and we enable PKU in XCR0. 862 + */ 863 + if (!boot_cpu_has(X86_FEATURE_OSPKE)) 864 + return -EINVAL; 865 + 866 + /* Set the bits we need in PKRU */ 867 + if (init_val & PKEY_DISABLE_ACCESS) 868 + new_pkru_bits |= PKRU_AD_BIT; 869 + if (init_val & PKEY_DISABLE_WRITE) 870 + new_pkru_bits |= PKRU_WD_BIT; 871 + 872 + /* Shift the bits in to the correct place in PKRU for pkey. 
*/ 873 + new_pkru_bits <<= pkey_shift; 874 + 875 + /* Locate old copy of the state in the xsave buffer */ 876 + old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); 877 + 878 + /* 879 + * When state is not in the buffer, it is in the init 880 + * state, set it manually. Otherwise, copy out the old 881 + * state. 882 + */ 883 + if (!old_pkru_state) 884 + new_pkru_state.pkru = 0; 885 + else 886 + new_pkru_state.pkru = old_pkru_state->pkru; 887 + 888 + /* mask off any old bits in place */ 889 + new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); 890 + /* Set the newly-requested bits */ 891 + new_pkru_state.pkru |= new_pkru_bits; 892 + 893 + /* 894 + * We could theoretically live without zeroing pkru.pad. 895 + * The current XSAVE feature state definition says that 896 + * only bytes 0->3 are used. But we do not want to 897 + * chance leaking kernel stack out to userspace in case a 898 + * memcpy() of the whole xsave buffer was done. 899 + * 900 + * They're in the same cacheline anyway. 901 + */ 902 + new_pkru_state.pad = 0; 903 + 904 + fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, 905 + sizeof(new_pkru_state)); 906 + 907 + return 0; 779 908 }
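The PKRU update in arch_set_user_pkey_access() above boils down to two bits per key (Access-Disable and Write-Disable) at bit position pkey * 2 in the 32-bit PKRU register. A minimal, userspace-compilable sketch of that bit arithmetic (constant names mirror the kernel's; update_pkru() is illustrative only, not a kernel or libc API):

```c
#include <stdint.h>

#define PKRU_AD_BIT         0x1u   /* access-disable */
#define PKRU_WD_BIT         0x2u   /* write-disable */
#define PKRU_BITS_PER_PKEY  2

#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE  0x2

/*
 * Compute the PKRU value that applies the 'init_val' restrictions
 * to 'pkey', using the same mask-then-set sequence as the patch.
 */
static uint32_t update_pkru(uint32_t old_pkru, int pkey, int init_val)
{
	int pkey_shift = pkey * PKRU_BITS_PER_PKEY;
	uint32_t new_bits = 0;

	if (init_val & PKEY_DISABLE_ACCESS)
		new_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_bits |= PKRU_WD_BIT;

	/* clear this pkey's old AD/WD bits, then set the requested ones */
	old_pkru &= ~((PKRU_AD_BIT | PKRU_WD_BIT) << pkey_shift);
	return old_pkru | (new_bits << pkey_shift);
}
```

For example, denying all access to pkey 4 sets only bit 8 (value 0x100) and leaves every other key's two bits untouched.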
+2 -2
arch/x86/kernel/ldt.c
··· 103 103 * we do not have to muck with descriptors here, that is 104 104 * done in switch_mm() as needed. 105 105 */ 106 - int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 106 + int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) 107 107 { 108 108 struct ldt_struct *new_ldt; 109 109 struct mm_struct *old_mm; ··· 144 144 * 145 145 * 64bit: Don't touch the LDT register - we're already in the next thread. 146 146 */ 147 - void destroy_context(struct mm_struct *mm) 147 + void destroy_context_ldt(struct mm_struct *mm) 148 148 { 149 149 free_ldt_struct(mm->context.ldt); 150 150 mm->context.ldt = NULL;
+2
arch/x86/kernel/process_64.c
··· 116 116 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 117 117 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 118 118 119 + if (boot_cpu_has(X86_FEATURE_OSPKE)) 120 + printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); 119 121 } 120 122 121 123 void release_thread(struct task_struct *dead_task)
+9
arch/x86/kernel/setup.c
··· 112 112 #include <asm/alternative.h> 113 113 #include <asm/prom.h> 114 114 #include <asm/microcode.h> 115 + #include <asm/mmu_context.h> 115 116 116 117 /* 117 118 * max_low_pfn_mapped: highest direct mapped pfn under 4GB ··· 1283 1282 return 0; 1284 1283 } 1285 1284 __initcall(register_kernel_offset_dumper); 1285 + 1286 + void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) 1287 + { 1288 + if (!boot_cpu_has(X86_FEATURE_OSPKE)) 1289 + return; 1290 + 1291 + seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); 1292 + }
+2
arch/x86/mm/Makefile
··· 34 34 obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 35 35 36 36 obj-$(CONFIG_X86_INTEL_MPX) += mpx.o 37 + obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o 38 +
+128 -22
arch/x86/mm/fault.c
··· 15 15 #include <linux/context_tracking.h> /* exception_enter(), ... */ 16 16 #include <linux/uaccess.h> /* faulthandler_disabled() */ 17 17 18 + #include <asm/cpufeature.h> /* boot_cpu_has, ... */ 18 19 #include <asm/traps.h> /* dotraplinkage, ... */ 19 20 #include <asm/pgalloc.h> /* pgd_*(), ... */ 20 21 #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 21 22 #include <asm/fixmap.h> /* VSYSCALL_ADDR */ 22 23 #include <asm/vsyscall.h> /* emulate_vsyscall */ 23 24 #include <asm/vm86.h> /* struct vm86 */ 25 + #include <asm/mmu_context.h> /* vma_pkey() */ 24 26 25 27 #define CREATE_TRACE_POINTS 26 28 #include <asm/trace/exceptions.h> ··· 35 33 * bit 2 == 0: kernel-mode access 1: user-mode access 36 34 * bit 3 == 1: use of reserved bit detected 37 35 * bit 4 == 1: fault was an instruction fetch 36 + * bit 5 == 1: protection keys block access 38 37 */ 39 38 enum x86_pf_error_code { 40 39 ··· 44 41 PF_USER = 1 << 2, 45 42 PF_RSVD = 1 << 3, 46 43 PF_INSTR = 1 << 4, 44 + PF_PK = 1 << 5, 47 45 }; 48 46 49 47 /* ··· 171 167 return prefetch; 172 168 } 173 169 170 + /* 171 + * A protection key fault means that the PKRU value did not allow 172 + * access to some PTE. Userspace can figure out what PKRU was 173 + * from the XSAVE state, and this function fills out a field in 174 + * siginfo so userspace can discover which protection key was set 175 + * on the PTE. 176 + * 177 + * If we get here, we know that the hardware signaled a PF_PK 178 + * fault and that there was a VMA once we got in the fault 179 + * handler. It does *not* guarantee that the VMA we find here 180 + * was the one that we faulted on. 181 + * 182 + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); 183 + * 2. T1 : set PKRU to deny access to pkey=4, touches page 184 + * 3. T1 : faults... 185 + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); 186 + * 5. T1 : enters fault handler, takes mmap_sem, etc... 187 + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really 188 + * faulted on a pte with its pkey=4. 
189 + */ 190 + static void fill_sig_info_pkey(int si_code, siginfo_t *info, 191 + struct vm_area_struct *vma) 192 + { 193 + /* This is effectively an #ifdef */ 194 + if (!boot_cpu_has(X86_FEATURE_OSPKE)) 195 + return; 196 + 197 + /* Fault not from Protection Keys: nothing to do */ 198 + if (si_code != SEGV_PKUERR) 199 + return; 200 + /* 201 + * force_sig_info_fault() is called from a number of 202 + * contexts, some of which have a VMA and some of which 203 + * do not. The PF_PK handling happens after we have a 204 + * valid VMA, so we should never reach this without a 205 + * valid VMA. 206 + */ 207 + if (!vma) { 208 + WARN_ONCE(1, "PKU fault with no VMA passed in"); 209 + info->si_pkey = 0; 210 + return; 211 + } 212 + /* 213 + * si_pkey should be thought of as a strong hint, but not 214 + * absolutely guaranteed to be 100% accurate because of 215 + * the race explained above. 216 + */ 217 + info->si_pkey = vma_pkey(vma); 218 + } 219 + 174 220 static void 175 221 force_sig_info_fault(int si_signo, int si_code, unsigned long address, 176 - struct task_struct *tsk, int fault) 222 + struct task_struct *tsk, struct vm_area_struct *vma, 223 + int fault) 177 224 { 178 225 unsigned lsb = 0; 179 226 siginfo_t info; ··· 238 183 if (fault & VM_FAULT_HWPOISON) 239 184 lsb = PAGE_SHIFT; 240 185 info.si_addr_lsb = lsb; 186 + 187 + fill_sig_info_pkey(si_code, &info, vma); 241 188 242 189 force_sig_info(si_signo, &info, tsk); 243 190 } ··· 718 661 struct task_struct *tsk = current; 719 662 unsigned long flags; 720 663 int sig; 664 + /* No context means no VMA to pass down */ 665 + struct vm_area_struct *vma = NULL; 721 666 722 667 /* Are we prepared to handle this kernel fault? */ 723 668 if (fixup_exception(regs, X86_TRAP_PF)) { ··· 743 684 tsk->thread.cr2 = address; 744 685 745 686 /* XXX: hwpoison faults will set the wrong code. 
*/ 746 - force_sig_info_fault(signal, si_code, address, tsk, 0); 687 + force_sig_info_fault(signal, si_code, address, 688 + tsk, vma, 0); 747 689 } 748 690 749 691 /* ··· 821 761 822 762 static void 823 763 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, 824 - unsigned long address, int si_code) 764 + unsigned long address, struct vm_area_struct *vma, 765 + int si_code) 825 766 { 826 767 struct task_struct *tsk = current; 827 768 ··· 865 804 tsk->thread.error_code = error_code; 866 805 tsk->thread.trap_nr = X86_TRAP_PF; 867 806 868 - force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); 807 + force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); 869 808 870 809 return; 871 810 } ··· 878 817 879 818 static noinline void 880 819 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, 881 - unsigned long address) 820 + unsigned long address, struct vm_area_struct *vma) 882 821 { 883 - __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); 822 + __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); 884 823 } 885 824 886 825 static void 887 826 __bad_area(struct pt_regs *regs, unsigned long error_code, 888 - unsigned long address, int si_code) 827 + unsigned long address, struct vm_area_struct *vma, int si_code) 889 828 { 890 829 struct mm_struct *mm = current->mm; 891 830 ··· 895 834 */ 896 835 up_read(&mm->mmap_sem); 897 836 898 - __bad_area_nosemaphore(regs, error_code, address, si_code); 837 + __bad_area_nosemaphore(regs, error_code, address, vma, si_code); 899 838 } 900 839 901 840 static noinline void 902 841 bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) 903 842 { 904 - __bad_area(regs, error_code, address, SEGV_MAPERR); 843 + __bad_area(regs, error_code, address, NULL, SEGV_MAPERR); 844 + } 845 + 846 + static inline bool bad_area_access_from_pkeys(unsigned long error_code, 847 + struct vm_area_struct *vma) 848 + { 849 + /* This code is always called on the 
current mm */ 850 + bool foreign = false; 851 + 852 + if (!boot_cpu_has(X86_FEATURE_OSPKE)) 853 + return false; 854 + if (error_code & PF_PK) 855 + return true; 856 + /* this checks permission keys on the VMA: */ 857 + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), 858 + (error_code & PF_INSTR), foreign)) 859 + return true; 860 + return false; 905 861 } 906 862 907 863 static noinline void 908 864 bad_area_access_error(struct pt_regs *regs, unsigned long error_code, 909 - unsigned long address) 865 + unsigned long address, struct vm_area_struct *vma) 910 866 { 911 - __bad_area(regs, error_code, address, SEGV_ACCERR); 867 + /* 868 + * This OSPKE check is not strictly necessary at runtime. 869 + * But, doing it this way allows compiler optimizations 870 + * if pkeys are compiled out. 871 + */ 872 + if (bad_area_access_from_pkeys(error_code, vma)) 873 + __bad_area(regs, error_code, address, vma, SEGV_PKUERR); 874 + else 875 + __bad_area(regs, error_code, address, vma, SEGV_ACCERR); 912 876 } 913 877 914 878 static void 915 879 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, 916 - unsigned int fault) 880 + struct vm_area_struct *vma, unsigned int fault) 917 881 { 918 882 struct task_struct *tsk = current; 919 883 int code = BUS_ADRERR; ··· 965 879 code = BUS_MCEERR_AR; 966 880 } 967 881 #endif 968 - force_sig_info_fault(SIGBUS, code, address, tsk, fault); 882 + force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); 969 883 } 970 884 971 885 static noinline void 972 886 mm_fault_error(struct pt_regs *regs, unsigned long error_code, 973 - unsigned long address, unsigned int fault) 887 + unsigned long address, struct vm_area_struct *vma, 888 + unsigned int fault) 974 889 { 975 890 if (fatal_signal_pending(current) && !(error_code & PF_USER)) { 976 891 no_context(regs, error_code, address, 0, 0); ··· 995 908 } else { 996 909 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| 997 910 VM_FAULT_HWPOISON_LARGE)) 998 - 
do_sigbus(regs, error_code, address, fault); 911 + do_sigbus(regs, error_code, address, vma, fault); 999 912 else if (fault & VM_FAULT_SIGSEGV) 1000 - bad_area_nosemaphore(regs, error_code, address); 913 + bad_area_nosemaphore(regs, error_code, address, vma); 1001 914 else 1002 915 BUG(); 1003 916 } ··· 1010 923 1011 924 if ((error_code & PF_INSTR) && !pte_exec(*pte)) 1012 925 return 0; 926 + /* 927 + * Note: We do not do lazy flushing on protection key 928 + * changes, so no spurious fault will ever set PF_PK. 929 + */ 930 + if ((error_code & PF_PK)) 931 + return 1; 1013 932 1014 933 return 1; 1015 934 } ··· 1105 1012 static inline int 1106 1013 access_error(unsigned long error_code, struct vm_area_struct *vma) 1107 1014 { 1015 + /* This is only called for the current mm, so: */ 1016 + bool foreign = false; 1017 + /* 1018 + * Make sure to check the VMA so that we do not perform 1019 + * faults just to hit a PF_PK as soon as we fill in a 1020 + * page. 1021 + */ 1022 + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), 1023 + (error_code & PF_INSTR), foreign)) 1024 + return 1; 1025 + 1108 1026 if (error_code & PF_WRITE) { 1109 1027 /* write, present and write, not present: */ 1110 1028 if (unlikely(!(vma->vm_flags & VM_WRITE))) ··· 1222 1118 * Don't take the mm semaphore here. 
If we fixup a prefetch 1223 1119 * fault we could otherwise deadlock: 1224 1120 */ 1225 - bad_area_nosemaphore(regs, error_code, address); 1121 + bad_area_nosemaphore(regs, error_code, address, NULL); 1226 1122 1227 1123 return; 1228 1124 } ··· 1235 1131 pgtable_bad(regs, error_code, address); 1236 1132 1237 1133 if (unlikely(smap_violation(error_code, regs))) { 1238 - bad_area_nosemaphore(regs, error_code, address); 1134 + bad_area_nosemaphore(regs, error_code, address, NULL); 1239 1135 return; 1240 1136 } 1241 1137 ··· 1244 1140 * in a region with pagefaults disabled then we must not take the fault 1245 1141 */ 1246 1142 if (unlikely(faulthandler_disabled() || !mm)) { 1247 - bad_area_nosemaphore(regs, error_code, address); 1143 + bad_area_nosemaphore(regs, error_code, address, NULL); 1248 1144 return; 1249 1145 } 1250 1146 ··· 1268 1164 1269 1165 if (error_code & PF_WRITE) 1270 1166 flags |= FAULT_FLAG_WRITE; 1167 + if (error_code & PF_INSTR) 1168 + flags |= FAULT_FLAG_INSTRUCTION; 1271 1169 1272 1170 /* 1273 1171 * When running in the kernel we expect faults to occur only to ··· 1290 1184 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 1291 1185 if ((error_code & PF_USER) == 0 && 1292 1186 !search_exception_tables(regs->ip)) { 1293 - bad_area_nosemaphore(regs, error_code, address); 1187 + bad_area_nosemaphore(regs, error_code, address, NULL); 1294 1188 return; 1295 1189 } 1296 1190 retry: ··· 1338 1232 */ 1339 1233 good_area: 1340 1234 if (unlikely(access_error(error_code, vma))) { 1341 - bad_area_access_error(regs, error_code, address); 1235 + bad_area_access_error(regs, error_code, address, vma); 1342 1236 return; 1343 1237 } 1344 1238 ··· 1376 1270 1377 1271 up_read(&mm->mmap_sem); 1378 1272 if (unlikely(fault & VM_FAULT_ERROR)) { 1379 - mm_fault_error(regs, error_code, address, fault); 1273 + mm_fault_error(regs, error_code, address, vma, fault); 1380 1274 return; 1381 1275 } 1382 1276
+28 -17
arch/x86/mm/gup.c
··· 11 11 #include <linux/swap.h> 12 12 #include <linux/memremap.h> 13 13 14 + #include <asm/mmu_context.h> 14 15 #include <asm/pgtable.h> 15 16 16 17 static inline pte_t gup_get_pte(pte_t *ptep) ··· 76 75 } 77 76 78 77 /* 78 + * 'pteval' can come from a pte, pmd or pud. We only check 79 + * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the 80 + * same value on all 3 types. 81 + */ 82 + static inline int pte_allows_gup(unsigned long pteval, int write) 83 + { 84 + unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; 85 + 86 + if (write) 87 + need_pte_bits |= _PAGE_RW; 88 + 89 + if ((pteval & need_pte_bits) != need_pte_bits) 90 + return 0; 91 + 92 + /* Check memory protection keys permissions. */ 93 + if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) 94 + return 0; 95 + 96 + return 1; 97 + } 98 + 99 + /* 79 100 * The performance critical leaf functions are made noinline otherwise gcc 80 101 * inlines everything into a single function which results in too much 81 102 * register pressure. 
··· 106 83 unsigned long end, int write, struct page **pages, int *nr) 107 84 { 108 85 struct dev_pagemap *pgmap = NULL; 109 - unsigned long mask; 110 86 int nr_start = *nr; 111 87 pte_t *ptep; 112 - 113 - mask = _PAGE_PRESENT|_PAGE_USER; 114 - if (write) 115 - mask |= _PAGE_RW; 116 88 117 89 ptep = pte_offset_map(&pmd, addr); 118 90 do { ··· 127 109 pte_unmap(ptep); 128 110 return 0; 129 111 } 130 - } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { 112 + } else if (!pte_allows_gup(pte_val(pte), write) || 113 + pte_special(pte)) { 131 114 pte_unmap(ptep); 132 115 return 0; 133 116 } ··· 183 164 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 184 165 unsigned long end, int write, struct page **pages, int *nr) 185 166 { 186 - unsigned long mask; 187 167 struct page *head, *page; 188 168 int refs; 189 169 190 - mask = _PAGE_PRESENT|_PAGE_USER; 191 - if (write) 192 - mask |= _PAGE_RW; 193 - if ((pmd_flags(pmd) & mask) != mask) 170 + if (!pte_allows_gup(pmd_val(pmd), write)) 194 171 return 0; 195 172 196 173 VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); ··· 246 231 static noinline int gup_huge_pud(pud_t pud, unsigned long addr, 247 232 unsigned long end, int write, struct page **pages, int *nr) 248 233 { 249 - unsigned long mask; 250 234 struct page *head, *page; 251 235 int refs; 252 236 253 - mask = _PAGE_PRESENT|_PAGE_USER; 254 - if (write) 255 - mask |= _PAGE_RW; 256 - if ((pud_flags(pud) & mask) != mask) 237 + if (!pte_allows_gup(pud_val(pud), write)) 257 238 return 0; 258 239 /* hugepages are never "special" */ 259 240 VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); ··· 433 422 start += nr << PAGE_SHIFT; 434 423 pages += nr; 435 424 436 - ret = get_user_pages_unlocked(current, mm, start, 425 + ret = get_user_pages_unlocked(start, 437 426 (end - start) >> PAGE_SHIFT, 438 427 write, 0, pages); 439 428
+2 -2
arch/x86/mm/mpx.c
··· 546 546 int nr_pages = 1; 547 547 int force = 0; 548 548 549 - gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, 550 - nr_pages, write, force, NULL, NULL); 549 + gup_ret = get_user_pages((unsigned long)addr, nr_pages, write, 550 + force, NULL, NULL); 551 551 /* 552 552 * get_user_pages() returns number of pages gotten. 553 553 * 0 means we failed to fault in and get anything,
+101
arch/x86/mm/pkeys.c
··· 1 + /* 2 + * Intel Memory Protection Keys management 3 + * Copyright (c) 2015, Intel Corporation. 4 + * 5 + * This program is free software; you can redistribute it and/or modify it 6 + * under the terms and conditions of the GNU General Public License, 7 + * version 2, as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope it will be useful, but WITHOUT 10 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 + * more details. 13 + */ 14 + #include <linux/mm_types.h> /* mm_struct, vma, etc... */ 15 + #include <linux/pkeys.h> /* PKEY_* */ 16 + #include <uapi/asm-generic/mman-common.h> 17 + 18 + #include <asm/cpufeature.h> /* boot_cpu_has, ... */ 19 + #include <asm/mmu_context.h> /* vma_pkey() */ 20 + #include <asm/fpu/internal.h> /* fpregs_active() */ 21 + 22 + int __execute_only_pkey(struct mm_struct *mm) 23 + { 24 + int ret; 25 + 26 + /* 27 + * We do not want to go through the relatively costly 28 + * dance to set PKRU if we do not need to. Check it 29 + * first and assume that if the execute-only pkey is 30 + * write-disabled that we do not have to set it 31 + * ourselves. We need preempt off so that nobody 32 + * can make fpregs inactive. 33 + */ 34 + preempt_disable(); 35 + if (fpregs_active() && 36 + !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) { 37 + preempt_enable(); 38 + return PKEY_DEDICATED_EXECUTE_ONLY; 39 + } 40 + preempt_enable(); 41 + ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY, 42 + PKEY_DISABLE_ACCESS); 43 + /* 44 + * If the PKRU-set operation failed somehow, just return 45 + * 0 and effectively disable execute-only support. 
46 + */ 47 + if (ret) 48 + return 0; 49 + 50 + return PKEY_DEDICATED_EXECUTE_ONLY; 51 + } 52 + 53 + static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) 54 + { 55 + /* Do this check first since the vm_flags should be hot */ 56 + if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) 57 + return false; 58 + if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY) 59 + return false; 60 + 61 + return true; 62 + } 63 + 64 + /* 65 + * This is only called for *plain* mprotect calls. 66 + */ 67 + int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) 68 + { 69 + /* 70 + * Is this an mprotect_pkey() call? If so, never 71 + * override the value that came from the user. 72 + */ 73 + if (pkey != -1) 74 + return pkey; 75 + /* 76 + * Look for a protection-key-driven execute-only mapping 77 + * which is now being given permissions that are not 78 + * execute-only. Move it back to the default pkey. 79 + */ 80 + if (vma_is_pkey_exec_only(vma) && 81 + (prot & (PROT_READ|PROT_WRITE))) { 82 + return 0; 83 + } 84 + /* 85 + * The mapping is execute-only. Go try to get the 86 + * execute-only protection key. If we fail to do that, 87 + * fall through as if we do not have execute-only 88 + * support. 89 + */ 90 + if (prot == PROT_EXEC) { 91 + pkey = execute_only_pkey(vma->vm_mm); 92 + if (pkey > 0) 93 + return pkey; 94 + } 95 + /* 96 + * This is a vanilla, non-pkey mprotect (or we failed to 97 + * setup execute-only), inherit the pkey from the VMA we 98 + * are working on. 99 + */ 100 + return vma_pkey(vma); 101 + }
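The override logic in __arch_override_mprotect_pkey() above reduces to a small decision table. A simplified, userspace-compilable restatement (the PROT_* values match Linux's, but 15 is only an assumed stand-in for the dedicated execute-only pkey, and the VMA is collapsed to just its current pkey; the real code also handles execute_only_pkey() allocation failure):

```c
#define PROT_READ  0x1
#define PROT_WRITE 0x2
#define PROT_EXEC  0x4

#define PKEY_DEDICATED_EXECUTE_ONLY 15  /* assumed value for illustration */

/* vma_pkey: pkey currently on the VMA; pkey: -1 for a plain mprotect() */
static int override_mprotect_pkey(int vma_pkey, int prot, int pkey)
{
	/* explicit pkey from mprotect_pkey(): never overridden */
	if (pkey != -1)
		return pkey;
	/* execute-only mapping gaining read/write: back to default pkey 0 */
	if (vma_pkey == PKEY_DEDICATED_EXECUTE_ONLY &&
	    (prot & (PROT_READ | PROT_WRITE)))
		return 0;
	/* a plain PROT_EXEC-only request gets the execute-only pkey */
	if (prot == PROT_EXEC)
		return PKEY_DEDICATED_EXECUTE_ONLY;
	/* vanilla mprotect(): keep whatever pkey the VMA already has */
	return vma_pkey;
}
```

This is what makes the mmap(..., PROT_EXEC) / mprotect(ptr, sz, PROT_EXEC) special case from the merge description work: only a pure PROT_EXEC request is routed to the execute-only key.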
+1 -1
drivers/char/agp/frontend.c
··· 156 156 { 157 157 unsigned long prot_bits; 158 158 159 - prot_bits = calc_vm_prot_bits(prot) | VM_SHARED; 159 + prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED; 160 160 return vm_get_page_prot(prot_bits); 161 161 } 162 162
+1 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
··· 518 518 uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE; 519 519 struct page **pages = ttm->pages + pinned; 520 520 521 - r = get_user_pages(current, current->mm, userptr, num_pages, 522 - write, 0, pages, NULL); 521 + r = get_user_pages(userptr, num_pages, write, 0, pages, NULL); 523 522 if (r < 0) 524 523 goto release_pages; 525 524
+3 -3
drivers/gpu/drm/etnaviv/etnaviv_gem.c
··· 753 753 754 754 down_read(&mm->mmap_sem); 755 755 while (pinned < npages) { 756 - ret = get_user_pages(task, mm, ptr, npages - pinned, 757 - !etnaviv_obj->userptr.ro, 0, 758 - pvec + pinned, NULL); 756 + ret = get_user_pages_remote(task, mm, ptr, npages - pinned, 757 + !etnaviv_obj->userptr.ro, 0, 758 + pvec + pinned, NULL); 759 759 if (ret < 0) 760 760 break; 761 761
+5 -5
drivers/gpu/drm/i915/i915_gem_userptr.c
··· 584 584 585 585 down_read(&mm->mmap_sem); 586 586 while (pinned < npages) { 587 - ret = get_user_pages(work->task, mm, 588 - obj->userptr.ptr + pinned * PAGE_SIZE, 589 - npages - pinned, 590 - !obj->userptr.read_only, 0, 591 - pvec + pinned, NULL); 587 + ret = get_user_pages_remote(work->task, mm, 588 + obj->userptr.ptr + pinned * PAGE_SIZE, 589 + npages - pinned, 590 + !obj->userptr.read_only, 0, 591 + pvec + pinned, NULL); 592 592 if (ret < 0) 593 593 break; 594 594

drivers/gpu/drm/radeon/radeon_ttm.c (+1 -2)
@@ -554 +554 @@
 		uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE;
 		struct page **pages = ttm->pages + pinned;
 
-		r = get_user_pages(current, current->mm, userptr, num_pages,
-				   write, 0, pages, NULL);
+		r = get_user_pages(userptr, num_pages, write, 0, pages, NULL);
 		if (r < 0)
 			goto release_pages;
 

drivers/gpu/drm/via/via_dmablit.c (+1 -2)
@@ -239 +239 @@
 	if (NULL == vsg->pages)
 		return -ENOMEM;
 	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(current, current->mm,
-			     (unsigned long)xfer->mem_addr,
+	ret = get_user_pages((unsigned long)xfer->mem_addr,
 			     vsg->num_pages,
 			     (vsg->direction == DMA_FROM_DEVICE),
 			     0, vsg->pages, NULL);

drivers/infiniband/core/umem.c (+1 -1)
@@ -188 +188 @@
 	sg_list_start = umem->sg_head.sgl;
 
 	while (npages) {
-		ret = get_user_pages(current, current->mm, cur_base,
+		ret = get_user_pages(cur_base,
 				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
 				     1, !umem->writable, page_list, vma_list);

drivers/infiniband/core/umem_odp.c (+4 -4)
@@ -572 +572 @@
 		 * complex (and doesn't gain us much performance in most use
 		 * cases).
 		 */
-		npages = get_user_pages(owning_process, owning_mm, user_virt,
-					gup_num_pages,
-					access_mask & ODP_WRITE_ALLOWED_BIT, 0,
-					local_page_list, NULL);
+		npages = get_user_pages_remote(owning_process, owning_mm,
+					       user_virt, gup_num_pages,
+					       access_mask & ODP_WRITE_ALLOWED_BIT,
+					       0, local_page_list, NULL);
 		up_read(&owning_mm->mmap_sem);
 
 		if (npages < 0)

drivers/infiniband/hw/mthca/mthca_memfree.c (+1 -2)
@@ -472 +472 @@
 		goto out;
 	}
 
-	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0,
-			     pages, NULL);
+	ret = get_user_pages(uaddr & PAGE_MASK, 1, 1, 0, pages, NULL);
 	if (ret < 0)
 		goto out;
 

drivers/infiniband/hw/qib/qib_user_pages.c (+1 -2)
@@ -66 +66 @@
 	}
 
 	for (got = 0; got < num_pages; got += ret) {
-		ret = get_user_pages(current, current->mm,
-				     start_page + got * PAGE_SIZE,
+		ret = get_user_pages(start_page + got * PAGE_SIZE,
 				     num_pages - got, 1, 1,
 				     p + got, NULL);
 		if (ret < 0)

drivers/infiniband/hw/usnic/usnic_uiom.c (+1 -1)
@@ -144 +144 @@
 	ret = 0;
 
 	while (npages) {
-		ret = get_user_pages(current, current->mm, cur_base,
+		ret = get_user_pages(cur_base,
 				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof(struct page *)),
 				     1, !writable, page_list, NULL);

drivers/iommu/amd_iommu_v2.c (+1)
@@ -526 +526 @@
 		flags |= FAULT_FLAG_USER;
 	if (fault->flags & PPR_FAULT_WRITE)
 		flags |= FAULT_FLAG_WRITE;
+	flags |= FAULT_FLAG_REMOTE;
 
 	down_read(&mm->mmap_sem);
 	vma = find_extend_vma(mm, address);

drivers/media/pci/ivtv/ivtv-udma.c (+2 -2)
@@ -124 +124 @@
 	}
 
 	/* Get user pages for DMA Xfer */
-	err = get_user_pages_unlocked(current, current->mm,
-			user_dma.uaddr, user_dma.page_count, 0, 1, dma->map);
+	err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, 0,
+			1, dma->map);
 
 	if (user_dma.page_count != err) {
 		IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n",

drivers/media/pci/ivtv/ivtv-yuv.c (+4 -6)
@@ -75 +75 @@
 	ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height);
 
 	/* Get user pages for DMA Xfer */
-	y_pages = get_user_pages_unlocked(current, current->mm,
-				y_dma.uaddr, y_dma.page_count, 0, 1,
-				&dma->map[0]);
+	y_pages = get_user_pages_unlocked(y_dma.uaddr,
+			y_dma.page_count, 0, 1, &dma->map[0]);
 	uv_pages = 0; /* silence gcc. value is set and consumed only if: */
 	if (y_pages == y_dma.page_count) {
-		uv_pages = get_user_pages_unlocked(current, current->mm,
-					uv_dma.uaddr, uv_dma.page_count, 0, 1,
-					&dma->map[y_pages]);
+		uv_pages = get_user_pages_unlocked(uv_dma.uaddr,
+				uv_dma.page_count, 0, 1, &dma->map[y_pages]);
 	}
 
 	if (y_pages != y_dma.page_count || uv_pages != uv_dma.page_count) {

drivers/media/v4l2-core/videobuf-dma-sg.c (+1 -2)
@@ -181 +181 @@
 	dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
 		data, size, dma->nr_pages);
 
-	err = get_user_pages(current, current->mm,
-			     data & PAGE_MASK, dma->nr_pages,
+	err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
 			     rw == READ, 1, /* force */
 			     dma->pages, NULL);
 

drivers/misc/mic/scif/scif_rma.c (-2)
@@ -1394 +1394 @@
 		}
 
 		pinned_pages->nr_pages = get_user_pages(
-				current,
-				mm,
 				(u64)addr,
 				nr_pages,
 				!!(prot & SCIF_PROT_WRITE),

drivers/misc/sgi-gru/grufault.c (+1 -2)
@@ -198 +198 @@
 #else
 	*pageshift = PAGE_SHIFT;
 #endif
-	if (get_user_pages
-	    (current, current->mm, vaddr, 1, write, 0, &page, NULL) <= 0)
+	if (get_user_pages(vaddr, 1, write, 0, &page, NULL) <= 0)
 		return -EFAULT;
 	*paddr = page_to_phys(page);
 	put_page(page);

drivers/scsi/st.c (-2)
@@ -4917 +4917 @@
 	/* Try to fault in all of the necessary pages */
 	/* rw==READ means read from drive, write into memory area */
 	res = get_user_pages_unlocked(
-		current,
-		current->mm,
 		uaddr,
 		nr_pages,
 		rw == READ,

drivers/staging/android/ashmem.c (+2 -2)
@@ -385 +385 @@
 	}
 
 	/* requested protection bits must match our allowed protection mask */
-	if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
-		     calc_vm_prot_bits(PROT_MASK))) {
+	if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) &
+		     calc_vm_prot_bits(PROT_MASK, 0))) {
 		ret = -EPERM;
 		goto out;
 	}

drivers/video/fbdev/pvr2fb.c (+2 -2)
@@ -686 +686 @@
 	if (!pages)
 		return -ENOMEM;
 
-	ret = get_user_pages_unlocked(current, current->mm, (unsigned long)buf,
-				      nr_pages, WRITE, 0, pages);
+	ret = get_user_pages_unlocked((unsigned long)buf, nr_pages, WRITE,
+				      0, pages);
 
 	if (ret < nr_pages) {
 		nr_pages = ret;

drivers/virt/fsl_hypervisor.c (+2 -3)
@@ -244 +244 @@
 
 	/* Get the physical addresses of the source buffer */
 	down_read(&current->mm->mmap_sem);
-	num_pinned = get_user_pages(current, current->mm,
-		param.local_vaddr - lb_offset, num_pages,
-		(param.source == -1) ? READ : WRITE,
+	num_pinned = get_user_pages(param.local_vaddr - lb_offset,
+		num_pages, (param.source == -1) ? READ : WRITE,
 		0, pages, NULL);
 	up_read(&current->mm->mmap_sem);
 

fs/exec.c (+6 -2)
@@ -199 +199 @@
 		return NULL;
 	}
 #endif
-	ret = get_user_pages(current, bprm->mm, pos,
-			1, write, 1, &page, NULL);
+	/*
+	 * We are doing an exec().  'current' is the process
+	 * doing the exec and bprm->mm is the new process's mm.
+	 */
+	ret = get_user_pages_remote(current, bprm->mm, pos, 1, write,
+			1, &page, NULL);
 	if (ret <= 0)
 		return NULL;
 

fs/proc/task_mmu.c (+14)
@@ -660 +660 @@
 		[ilog2(VM_MERGEABLE)]	= "mg",
 		[ilog2(VM_UFFD_MISSING)]= "um",
 		[ilog2(VM_UFFD_WP)]	= "uw",
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+		/* These come out via ProtectionKey: */
+		[ilog2(VM_PKEY_BIT0)]	= "",
+		[ilog2(VM_PKEY_BIT1)]	= "",
+		[ilog2(VM_PKEY_BIT2)]	= "",
+		[ilog2(VM_PKEY_BIT3)]	= "",
+#endif
 	};
 	size_t i;
 
 	seq_puts(m, "VmFlags: ");
 	for (i = 0; i < BITS_PER_LONG; i++) {
+		if (!mnemonics[i][0])
+			continue;
 		if (vma->vm_flags & (1UL << i)) {
 			seq_printf(m, "%c%c ",
 				   mnemonics[i][0], mnemonics[i][1]);
@@ -701 +710 @@
 	return 0;
 }
 #endif /* HUGETLB_PAGE */
+
+void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
+{
+}
 
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
@@ -783 +796 @@
 		   (vma->vm_flags & VM_LOCKED) ?
 			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
+	arch_show_smap(m, vma);
 	show_smap_vma_flags(m, vma);
 	m_cache_vma(m, vma);
 	return 0;

include/asm-generic/mm_hooks.h (+12)
@@ -26 +26 @@
 {
 }
 
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool execute, bool foreign)
+{
+	/* by default, allow everything */
+	return true;
+}
+
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+	/* by default, allow everything */
+	return true;
+}
 #endif	/* _ASM_GENERIC_MM_HOOKS_H */

include/linux/mm.h (+89 -10)
@@ -193 +193 @@
 #define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
 #define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */
 
+#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
+#define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
+#define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
+#define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
+#define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
+#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
+
 #if defined(CONFIG_X86)
 # define VM_PAT		VM_ARCH_1	/* PAT reserves whole VMA at once (x86) */
+#if defined (CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)
+# define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
+# define VM_PKEY_BIT0	VM_HIGH_ARCH_0	/* A protection key is a 4-bit value */
+# define VM_PKEY_BIT1	VM_HIGH_ARCH_1
+# define VM_PKEY_BIT2	VM_HIGH_ARCH_2
+# define VM_PKEY_BIT3	VM_HIGH_ARCH_3
+#endif
 #elif defined(CONFIG_PPC)
 # define VM_SAO		VM_ARCH_1	/* Strong Access Ordering (powerpc) */
 #elif defined(CONFIG_PARISC)
@@ -256 +274 @@
 #define FAULT_FLAG_KILLABLE	0x10	/* The fault task is in SIGKILL killable region */
 #define FAULT_FLAG_TRIED	0x20	/* Second try */
 #define FAULT_FLAG_USER		0x40	/* The fault originated in userspace */
+#define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
+#define FAULT_FLAG_INSTRUCTION	0x100	/* The fault was during an instruction fetch */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -1224 +1244 @@
 		      unsigned long start, unsigned long nr_pages,
 		      unsigned int foll_flags, struct page **pages,
 		      struct vm_area_struct **vmas, int *nonblocking);
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		    unsigned long start, unsigned long nr_pages,
-		    int write, int force, struct page **pages,
-		    struct vm_area_struct **vmas);
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
-		    unsigned long start, unsigned long nr_pages,
-		    int write, int force, struct page **pages,
-		    int *locked);
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+			   unsigned long start, unsigned long nr_pages,
+			   int write, int force, struct page **pages,
+			   struct vm_area_struct **vmas);
+long get_user_pages6(unsigned long start, unsigned long nr_pages,
+		    int write, int force, struct page **pages,
+		    struct vm_area_struct **vmas);
+long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
+		    int write, int force, struct page **pages, int *locked);
 long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 			       unsigned long start, unsigned long nr_pages,
 			       int write, int force, struct page **pages,
 			       unsigned int gup_flags);
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
-		    unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
 		    int write, int force, struct page **pages);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
+
+/* suppress warnings from use in EXPORT_SYMBOL() */
+#ifndef __DISABLE_GUP_DEPRECATED
+#define __gup_deprecated __deprecated
+#else
+#define __gup_deprecated
+#endif
+/*
+ * These macros provide backward-compatibility with the old
+ * get_user_pages() variants which took tsk/mm.  These
+ * functions/macros provide both compile-time __deprecated so we
+ * can catch old-style use and not break the build.  The actual
+ * functions also have WARN_ON()s to let us know at runtime if
+ * the get_user_pages() should have been the "remote" variant.
+ *
+ * These are hideous, but temporary.
+ *
+ * If you run into one of these __deprecated warnings, look
+ * at how you are calling get_user_pages().  If you are calling
+ * it with current/current->mm as the first two arguments,
+ * simply remove those arguments.  The behavior will be the same
+ * as it is now.  If you are calling it on another task, use
+ * get_user_pages_remote() instead.
+ *
+ * Any questions?  Ask Dave Hansen <dave@sr71.net>
+ */
+long
+__gup_deprecated
+get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, unsigned long nr_pages,
+		int write, int force, struct page **pages,
+		struct vm_area_struct **vmas);
+#define GUP_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages, ...)	\
+	get_user_pages
+#define get_user_pages(...) GUP_MACRO(__VA_ARGS__,	\
+		get_user_pages8, x,			\
+		get_user_pages6, x, x, x, x, x)(__VA_ARGS__)
+
+__gup_deprecated
+long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, unsigned long nr_pages,
+		int write, int force, struct page **pages,
+		int *locked);
+#define GUPL_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages_locked, ...) \
+	get_user_pages_locked
+#define get_user_pages_locked(...) GUPL_MACRO(__VA_ARGS__,	\
+		get_user_pages_locked8,	x,			\
+		get_user_pages_locked6, x, x, x, x)(__VA_ARGS__)
+
+__gup_deprecated
+long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, unsigned long nr_pages,
+		int write, int force, struct page **pages);
+#define GUPU_MACRO(_1, _2, _3, _4, _5, _6, _7, get_user_pages_unlocked, ...) \
+	get_user_pages_unlocked
+#define get_user_pages_unlocked(...) GUPU_MACRO(__VA_ARGS__,	\
+		get_user_pages_unlocked7, x,			\
+		get_user_pages_unlocked5, x, x, x, x)(__VA_ARGS__)
 
 /* Container for pinned pfns / pages */
 struct frame_vector {
@@ -2169 +2247 @@
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
 #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
 #define FOLL_MLOCK	0x1000	/* lock present pages */
+#define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
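The `GUP_MACRO`/`GUPL_MACRO`/`GUPU_MACRO` shims in the mm.h hunk above dispatch on argument count: the fixed trailing arguments shift which macro parameter lands in the `name` slot, so an 8-argument call selects the old-style function and a 6-argument call selects the new one. The standalone sketch below demonstrates the same trick with hypothetical `func8`/`func6` stand-ins.

```c
#include <assert.h>

/* hypothetical stand-ins for the old 8-arg and new 6-arg variants */
static long func8(long a, long b, long c, long d, long e, long f, long g, long h)
{
	return a + b + c + d + e + f + g + h; /* uses all 8 args */
}

static long func6(long a, long b, long c, long d, long e, long f)
{
	return -(a + b + c + d + e + f); /* negated, to tell the two apart */
}

/*
 * PICK always returns its 9th argument.  Appending
 * (func8, x, func6, x, x, x, x, x) after the call's own arguments
 * means: with 8 call args the 9th position is func8, with 6 call
 * args the 9th position is func6.
 */
#define PICK(_1, _2, _3, _4, _5, _6, _7, _8, name, ...) name
#define func(...) PICK(__VA_ARGS__,	\
		func8, x,		\
		func6, x, x, x, x, x)(__VA_ARGS__)
```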

include/linux/mman.h (+3 -3)
@@ -35 +35 @@
  */
 
 #ifndef arch_calc_vm_prot_bits
-#define arch_calc_vm_prot_bits(prot) 0
+#define arch_calc_vm_prot_bits(prot, pkey) 0
 #endif
 
 #ifndef arch_vm_get_page_prot
@@ -70 +70 @@
  * Combine the mmap "prot" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_prot_bits(unsigned long prot)
+calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 {
 	return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
 	       _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
 	       _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC) |
-	       arch_calc_vm_prot_bits(prot);
+	       arch_calc_vm_prot_bits(prot, pkey);
 }
 
 /*

include/linux/pkeys.h (new file, +33)
@@ -0 +1 @@
+#ifndef _LINUX_PKEYS_H
+#define _LINUX_PKEYS_H
+
+#include <linux/mm_types.h>
+#include <asm/mmu_context.h>
+
+#define PKEY_DISABLE_ACCESS	0x1
+#define PKEY_DISABLE_WRITE	0x2
+#define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
+				 PKEY_DISABLE_WRITE)
+
+#ifdef CONFIG_ARCH_HAS_PKEYS
+#include <asm/pkeys.h>
+#else /* ! CONFIG_ARCH_HAS_PKEYS */
+#define arch_max_pkey() (1)
+#define execute_only_pkey(mm) (0)
+#define arch_override_mprotect_pkey(vma, prot, pkey) (0)
+#define PKEY_DEDICATED_EXECUTE_ONLY 0
+#endif /* ! CONFIG_ARCH_HAS_PKEYS */
+
+/*
+ * This is called from mprotect_pkey().
+ *
+ * Returns true if the protection key is valid.
+ */
+static inline bool validate_pkey(int pkey)
+{
+	if (pkey < 0)
+		return false;
+	return (pkey < arch_max_pkey());
+}
+
+#endif /* _LINUX_PKEYS_H */
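The `PKEY_DISABLE_*` flags above describe per-key rights, and each key occupies two adjacent bits in the PKRU register (access-disable, then write-disable), so a key's rights land at bit position `2 * pkey`. Below is a small user-space sketch of the validity check and that bit placement; `MAX_PKEY = 16` stands in for `arch_max_pkey()` on x86 and is an assumption of the model.

```c
#include <assert.h>
#include <stdbool.h>

#define PKEY_DISABLE_ACCESS	0x1
#define PKEY_DISABLE_WRITE	0x2
#define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)

/* assumption: x86 exposes 16 keys; stands in for arch_max_pkey() */
#define MAX_PKEY 16

/* same check as validate_pkey() in the hunk above */
static bool validate_pkey(int pkey)
{
	if (pkey < 0)
		return false;
	return pkey < MAX_PKEY;
}

/* compute where a key's access-disable/write-disable bits sit in PKRU */
static unsigned int pkru_bits_for(int pkey, unsigned int rights)
{
	return (rights & PKEY_ACCESS_MASK) << (2 * pkey);
}
```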

include/uapi/asm-generic/siginfo.h (+12 -5)
@@ -91 +91 @@
 			int _trapno;	/* TRAP # which caused the signal */
 #endif
 			short _addr_lsb; /* LSB of the reported address */
-			struct {
-				void __user *_lower;
-				void __user *_upper;
-			} _addr_bnd;
+			union {
+				/* used when si_code=SEGV_BNDERR */
+				struct {
+					void __user *_lower;
+					void __user *_upper;
+				} _addr_bnd;
+				/* used when si_code=SEGV_PKUERR */
+				__u32 _pkey;
+			};
 		} _sigfault;
 
 		/* SIGPOLL */
@@ -137 +142 @@
 #define si_addr_lsb	_sifields._sigfault._addr_lsb
 #define si_lower	_sifields._sigfault._addr_bnd._lower
 #define si_upper	_sifields._sigfault._addr_bnd._upper
+#define si_pkey		_sifields._sigfault._pkey
 #define si_band		_sifields._sigpoll._band
 #define si_fd		_sifields._sigpoll._fd
 #ifdef __ARCH_SIGSYS
@@ -206 +212 @@
 #define SEGV_MAPERR	(__SI_FAULT|1)	/* address not mapped to object */
 #define SEGV_ACCERR	(__SI_FAULT|2)	/* invalid permissions for mapped object */
 #define SEGV_BNDERR	(__SI_FAULT|3)	/* failed address bound checks */
-#define NSIGSEGV	3
+#define SEGV_PKUERR	(__SI_FAULT|4)	/* failed protection key checks */
+#define NSIGSEGV	4
 
 /*
  * SIGBUS si_codes
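Wrapping `_addr_bnd` and the new `_pkey` field in an anonymous union means `si_pkey` reuses the storage of the MPX bound-check fields rather than growing `siginfo_t`; the `si_code` value (`SEGV_BNDERR` vs `SEGV_PKUERR`) tells userspace which member is live. The sketch below is a mock of that layout, not the real `siginfo_t`.

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* mock of the _sigfault layout change; field names are simplified */
struct mock_sigfault {
	void *addr;		/* faulting address */
	short addr_lsb;		/* LSB of the reported address */
	union {
		struct {	/* live when si_code == SEGV_BNDERR */
			void *lower;
			void *upper;
		} addr_bnd;
		uint32_t pkey;	/* live when si_code == SEGV_PKUERR */
	};
};
```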

kernel/events/uprobes.c (+8 -2)
@@ -299 +299 @@
 
 retry:
 	/* Read the page with vaddr into memory */
-	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+	ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
 	if (ret <= 0)
 		return ret;
 
@@ -1701 +1701 @@
 	if (likely(result == 0))
 		goto out;
 
-	result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+	/*
+	 * The NULL 'tsk' here ensures that any faults that occur here
+	 * will not be accounted to the task.  'mm' *is* current->mm,
+	 * but we treat this as a 'remote' access since it is
+	 * essentially a kernel access to the memory.
+	 */
+	result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
 	if (result < 0)
 		return result;
 

kernel/signal.c (+4)
@@ -2709 +2709 @@
 		err |= __put_user(from->si_upper, &to->si_upper);
 	}
 #endif
+#ifdef SEGV_PKUERR
+	if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR)
+		err |= __put_user(from->si_pkey, &to->si_pkey);
+#endif
 		break;
 	case __SI_CHLD:
 		err |= __put_user(from->si_pid, &to->si_pid);

mm/Kconfig (+5)
@@ -667 +667 @@
 
 config FRAME_VECTOR
 	bool
+
+config ARCH_USES_HIGH_VMA_FLAGS
+	bool
+config ARCH_HAS_PKEYS
+	bool

mm/frame_vector.c (+1 -1)
@@ -58 +58 @@
 	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
 		vec->got_ref = true;
 		vec->is_pfns = false;
-		ret = get_user_pages_locked(current, mm, start, nr_frames,
+		ret = get_user_pages_locked(start, nr_frames,
 			write, force, (struct page **)(vec->ptrs), &locked);
 		goto out;
 	}

mm/gup.c (+107 -20)
@@ -1 +1 @@
+#define __DISABLE_GUP_DEPRECATED 1
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/err.h>
@@ -14 +15 @@
 #include <linux/rwsem.h>
 #include <linux/hugetlb.h>
 
+#include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
@@ -363 +365 @@
 		return -ENOENT;
 	if (*flags & FOLL_WRITE)
 		fault_flags |= FAULT_FLAG_WRITE;
+	if (*flags & FOLL_REMOTE)
+		fault_flags |= FAULT_FLAG_REMOTE;
 	if (nonblocking)
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 	if (*flags & FOLL_NOWAIT)
@@ -413 +417 @@
 static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 {
 	vm_flags_t vm_flags = vma->vm_flags;
+	int write = (gup_flags & FOLL_WRITE);
+	int foreign = (gup_flags & FOLL_REMOTE);
 
 	if (vm_flags & (VM_IO | VM_PFNMAP))
 		return -EFAULT;
 
-	if (gup_flags & FOLL_WRITE) {
+	if (write) {
 		if (!(vm_flags & VM_WRITE)) {
 			if (!(gup_flags & FOLL_FORCE))
 				return -EFAULT;
@@ -443 +449 @@
 		if (!(vm_flags & VM_MAYREAD))
 			return -EFAULT;
 	}
+	/*
+	 * gups are always data accesses, not instruction
+	 * fetches, so execute=false here
+	 */
+	if (!arch_vma_access_permitted(vma, write, false, foreign))
+		return -EFAULT;
 	return 0;
 }
 
@@ -609 +621 @@
 }
 EXPORT_SYMBOL(__get_user_pages);
 
+bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
+{
+	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
+	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
+	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
+
+	if (!(vm_flags & vma->vm_flags))
+		return false;
+
+	/*
+	 * The architecture might have a hardware protection
+	 * mechanism other than read/write that can deny access.
+	 *
+	 * gup always represents data access, not instruction
+	 * fetches, so execute=false here:
+	 */
+	if (!arch_vma_access_permitted(vma, write, false, foreign))
+		return false;
+
+	return true;
+}
+
 /*
  * fixup_user_fault() - manually resolve a user page fault
  * @tsk:	the task_struct to use for page fault accounting, or
@@ -644 +678 @@
 		     bool *unlocked)
 {
 	struct vm_area_struct *vma;
-	vm_flags_t vm_flags;
 	int ret, major = 0;
 
 	if (unlocked)
@@ -655 +688 @@
 	if (!vma || address < vma->vm_start)
 		return -EFAULT;
 
-	vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
-	if (!(vm_flags & vma->vm_flags))
+	if (!vma_permits_fault(vma, fault_flags))
 		return -EFAULT;
 
 	ret = handle_mm_fault(mm, vma, address, fault_flags);
@@ -807 +839 @@
  *      if (locked)
  *          up_read(&mm->mmap_sem);
  */
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
-			   unsigned long start, unsigned long nr_pages,
+long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
 			   int write, int force, struct page **pages,
 			   int *locked)
 {
-	return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
-				       pages, NULL, locked, true, FOLL_TOUCH);
+	return __get_user_pages_locked(current, current->mm, start, nr_pages,
+				       write, force, pages, NULL, locked, true,
+				       FOLL_TOUCH);
 }
-EXPORT_SYMBOL(get_user_pages_locked);
+EXPORT_SYMBOL(get_user_pages_locked6);
 
 /*
  * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
@@ -860 +892 @@
  * or if "force" shall be set to 1 (get_user_pages_fast misses the
  * "force" parameter).
  */
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
-			     unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
 			     int write, int force, struct page **pages)
 {
-	return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
-					 force, pages, FOLL_TOUCH);
+	return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
+					 write, force, pages, FOLL_TOUCH);
 }
-EXPORT_SYMBOL(get_user_pages_unlocked);
+EXPORT_SYMBOL(get_user_pages_unlocked5);
 
 /*
- * get_user_pages() - pin user pages in memory
+ * get_user_pages_remote() - pin user pages in memory
  * @tsk:	the task_struct to use for page fault accounting, or
  *		NULL if faults are not to be recorded.
  * @mm:		mm_struct of target mm
@@ -924 +955 @@
  * should use get_user_pages because it cannot pass
  * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
  */
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, unsigned long nr_pages, int write,
-		int force, struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, unsigned long nr_pages,
+		int write, int force, struct page **pages,
+		struct vm_area_struct **vmas)
 {
 	return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
-				       pages, vmas, NULL, false, FOLL_TOUCH);
+				       pages, vmas, NULL, false,
+				       FOLL_TOUCH | FOLL_REMOTE);
 }
-EXPORT_SYMBOL(get_user_pages);
+EXPORT_SYMBOL(get_user_pages_remote);
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's.  We also
+ * obviously don't pass FOLL_REMOTE in here.
+ */
+long get_user_pages6(unsigned long start, unsigned long nr_pages,
+		int write, int force, struct page **pages,
+		struct vm_area_struct **vmas)
+{
+	return __get_user_pages_locked(current, current->mm, start, nr_pages,
+				       write, force, pages, vmas, NULL, false,
+				       FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages6);
 
 /**
  * populate_vma_page_range() -  populate a range of pages in the vma.
@@ -1142 +1191 @@
 		 */
 		if (!pte_present(pte) || pte_special(pte) ||
 			pte_protnone(pte) || (write && !pte_write(pte)))
+			goto pte_unmap;
+
+		if (!arch_pte_access_permitted(pte, write))
 			goto pte_unmap;
 
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1467 +1519 @@
 }
 
 #endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
+
+long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, unsigned long nr_pages,
+		     int write, int force, struct page **pages,
+		     struct vm_area_struct **vmas)
+{
+	WARN_ONCE(tsk != current, "get_user_pages() called on remote task");
+	WARN_ONCE(mm != current->mm, "get_user_pages() called on remote mm");
+
+	return get_user_pages6(start, nr_pages, write, force, pages, vmas);
+}
+EXPORT_SYMBOL(get_user_pages8);
+
+long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long start, unsigned long nr_pages,
+			    int write, int force, struct page **pages,
+			    int *locked)
+{
+	WARN_ONCE(tsk != current, "get_user_pages_locked() called on remote task");
+	WARN_ONCE(mm != current->mm, "get_user_pages_locked() called on remote mm");
+
+	return get_user_pages_locked6(start, nr_pages, write, force, pages, locked);
+}
+EXPORT_SYMBOL(get_user_pages_locked8);
+
+long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
+			      unsigned long start, unsigned long nr_pages,
+			      int write, int force, struct page **pages)
+{
+	WARN_ONCE(tsk != current, "get_user_pages_unlocked() called on remote task");
+	WARN_ONCE(mm != current->mm, "get_user_pages_unlocked() called on remote mm");
+
+	return get_user_pages_unlocked5(start, nr_pages, write, force, pages);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked7);

mm/ksm.c (+9 -3)
@@ -352 +352 @@
 /*
  * We use break_ksm to break COW on a ksm page: it's a stripped down
  *
- *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
+ *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
  *		put_page(page);
  *
  * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
  * in case the application has unmapped and remapped mm,addr meanwhile.
  * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
  * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ *
+ * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * of the process that owns 'vma'.  We also do not want to enforce
+ * protection keys here anyway.
  */
 static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 {
@@ -367 +371 @@
 
 	do {
 		cond_resched();
-		page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
+		page = follow_page(vma, addr,
+				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
 		if (IS_ERR_OR_NULL(page))
 			break;
 		if (PageKsm(page))
 			ret = handle_mm_fault(vma->vm_mm, vma, addr,
-							FAULT_FLAG_WRITE);
+							FAULT_FLAG_WRITE |
+							FAULT_FLAG_REMOTE);
 		else
 			ret = VM_FAULT_WRITE;
 		put_page(page);

mm/memory.c (+7 -1)
@@ -65 +65 @@
 #include <linux/userfaultfd_k.h>
 
 #include <asm/io.h>
+#include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
@@ -3375 +3376 @@
 	pmd_t *pmd;
 	pte_t *pte;
 
+	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+					    flags & FAULT_FLAG_INSTRUCTION,
+					    flags & FAULT_FLAG_REMOTE))
+		return VM_FAULT_SIGSEGV;
+
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 
@@ -3691 +3697 @@
 		void *maddr;
 		struct page *page = NULL;
 
-		ret = get_user_pages(tsk, mm, addr, 1,
+		ret = get_user_pages_remote(tsk, mm, addr, 1,
 				write, 1, &page, &vma);
 		if (ret <= 0) {
 #ifndef CONFIG_HAVE_IOREMAP_PROT

mm/mempolicy.c (+3 -3)
@@ -846 +846 @@
 	}
 }
 
-static int lookup_node(struct mm_struct *mm, unsigned long addr)
+static int lookup_node(unsigned long addr)
 {
 	struct page *p;
 	int err;
 
-	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
+	err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 	if (err >= 0) {
 		err = page_to_nid(p);
 		put_page(p);
@@ -906 +906 @@
 
 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
-			err = lookup_node(mm, addr);
+			err = lookup_node(addr);
 			if (err < 0)
 				goto out;
 			*policy = err;

mm/mmap.c (+9 -1)
@@ -42 +42 @@
 #include <linux/printk.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/moduleparam.h>
+#include <linux/pkeys.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1145 +1146 @@
 			unsigned long pgoff, unsigned long *populate)
 {
 	struct mm_struct *mm = current->mm;
+	int pkey = 0;
 
 	*populate = 0;
 
@@ -1184 +1186 @@
 	if (offset_in_page(addr))
 		return addr;
 
+	if (prot == PROT_EXEC) {
+		pkey = execute_only_pkey(mm);
+		if (pkey < 0)
+			pkey = 0;
+	}
+
 	/* Do simple checking here so the lower-level routines won't have
 	 * to. we assume access permissions have been handled by the open
 	 * of the memory object, so we don't do any here.
 	 */
-	vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
 	if (flags & MAP_LOCKED)
mm/mprotect.c | +4 -4
@@ -24,6 +24,7 @@
 #include <linux/migrate.h>
 #include <linux/perf_event.h>
 #include <linux/ksm.h>
+#include <linux/pkeys.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -355,7 +354,7 @@
 SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 		unsigned long, prot)
 {
-	unsigned long vm_flags, nstart, end, tmp, reqprot;
+	unsigned long nstart, end, tmp, reqprot;
 	struct vm_area_struct *vma, *prev;
 	int error = -EINVAL;
 	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
@@ -380,8 +379,6 @@
 	 */
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
 		prot |= PROT_EXEC;
-
-	vm_flags = calc_vm_prot_bits(prot);
 
 	down_write(&current->mm->mmap_sem);
 
@@ -410,10 +411,11 @@
 
 	for (nstart = start ; ; ) {
 		unsigned long newflags;
+		int pkey = arch_override_mprotect_pkey(vma, prot, -1);
 
 		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
 
-		newflags = vm_flags;
+		newflags = calc_vm_prot_bits(prot, pkey);
 		newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
 
 		/* newflags >> 4 shift VM_MAY% in place of VM_% */
mm/nommu.c | +46 -20
@@ -15,6 +15,8 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#define __DISABLE_GUP_DEPRECATED
+
 #include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/vmacache.h>
@@ -161,8 +159,7 @@
  *   slab page or a secondary page from a compound page
  * - don't permit access to VMAs that don't support it, such as I/O mappings
  */
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		    unsigned long start, unsigned long nr_pages,
+long get_user_pages6(unsigned long start, unsigned long nr_pages,
 		    int write, int force, struct page **pages,
 		    struct vm_area_struct **vmas)
 {
@@ -172,20 +171,18 @@
 	if (force)
 		flags |= FOLL_FORCE;
 
-	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
-				NULL);
+	return __get_user_pages(current, current->mm, start, nr_pages, flags,
+				pages, vmas, NULL);
 }
-EXPORT_SYMBOL(get_user_pages);
+EXPORT_SYMBOL(get_user_pages6);
 
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
-			   unsigned long start, unsigned long nr_pages,
-			   int write, int force, struct page **pages,
-			   int *locked)
+long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
+			    int write, int force, struct page **pages,
+			    int *locked)
 {
-	return get_user_pages(tsk, mm, start, nr_pages, write, force,
-			      pages, NULL);
+	return get_user_pages6(start, nr_pages, write, force, pages, NULL);
 }
-EXPORT_SYMBOL(get_user_pages_locked);
+EXPORT_SYMBOL(get_user_pages_locked6);
 
 long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 			       unsigned long start, unsigned long nr_pages,
@@ -192,21 +193,20 @@
 {
 	long ret;
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
-			     pages, NULL);
+	ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages,
+			       NULL, NULL);
 	up_read(&mm->mmap_sem);
 	return ret;
 }
 EXPORT_SYMBOL(__get_user_pages_unlocked);
 
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
-			     unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
 			     int write, int force, struct page **pages)
 {
-	return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
-					 force, pages, 0);
+	return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
+					 write, force, pages, 0);
 }
-EXPORT_SYMBOL(get_user_pages_unlocked);
+EXPORT_SYMBOL(get_user_pages_unlocked5);
 
 /**
  * follow_pfn - look up PFN at a user virtual address
@@ -1059,7 +1061,7 @@
 {
 	unsigned long vm_flags;
 
-	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
+	vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
 	/* vm_flags |= mm->def_flags; */
 
 	if (!(capabilities & NOMMU_MAP_DIRECT)) {
@@ -1989,3 +1991,31 @@
 	return 0;
 }
 subsys_initcall(init_admin_reserve);
+
+long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, unsigned long nr_pages,
+		     int write, int force, struct page **pages,
+		     struct vm_area_struct **vmas)
+{
+	return get_user_pages6(start, nr_pages, write, force, pages, vmas);
+}
+EXPORT_SYMBOL(get_user_pages8);
+
+long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long start, unsigned long nr_pages,
+			    int write, int force, struct page **pages,
+			    int *locked)
+{
+	return get_user_pages_locked6(start, nr_pages, write,
+				      force, pages, locked);
+}
+EXPORT_SYMBOL(get_user_pages_locked8);
+
+long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
+			      unsigned long start, unsigned long nr_pages,
+			      int write, int force, struct page **pages)
+{
+	return get_user_pages_unlocked5(start, nr_pages, write, force, pages);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked7);
+
mm/process_vm_access.c | +8 -3
@@ -98,9 +98,14 @@
 		int pages = min(nr_pages, max_pages_per_loop);
 		size_t bytes;
 
-		/* Get the pages we're interested in */
-		pages = get_user_pages_unlocked(task, mm, pa, pages,
-						vm_write, 0, process_pages);
+		/*
+		 * Get the pages we're interested in. We must
+		 * add FOLL_REMOTE because task/mm might not
+		 * current/current->mm
+		 */
+		pages = __get_user_pages_unlocked(task, mm, pa, pages,
+						  vm_write, 0, process_pages,
+						  FOLL_REMOTE);
 		if (pages <= 0)
 			return -EFAULT;
 
mm/util.c | +1 -3
@@ -283,9 +283,7 @@
 int __weak get_user_pages_fast(unsigned long start,
 				int nr_pages, int write, struct page **pages)
 {
-	struct mm_struct *mm = current->mm;
-	return get_user_pages_unlocked(current, mm, start, nr_pages,
-				       write, 0, pages);
+	return get_user_pages_unlocked(start, nr_pages, write, 0, pages);
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
net/ceph/pagevec.c | +1 -1
@@ -24,7 +24,7 @@
 		return ERR_PTR(-ENOMEM);
 
 	while (got < num_pages) {
-		rc = get_user_pages_unlocked(current, current->mm,
+		rc = get_user_pages_unlocked(
 		    (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
 		    num_pages - got, write_page, 0, pages + got);
 		if (rc < 0)
security/tomoyo/domain.c | +8 -1
@@ -874,7 +874,14 @@
 	}
 	/* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */
 #ifdef CONFIG_MMU
-	if (get_user_pages(current, bprm->mm, pos, 1, 0, 1, &page, NULL) <= 0)
+	/*
+	 * This is called at execve() time in order to dig around
+	 * in the argv/environment of the new proceess
+	 * (represented by bprm). 'current' is the process doing
+	 * the execve().
+	 */
+	if (get_user_pages_remote(current, bprm->mm, pos, 1,
+				0, 1, &page, NULL) <= 0)
 		return false;
 #else
 	page = bprm->page[pos / PAGE_SIZE];
virt/kvm/async_pf.c | +7 -1
@@ -79,7 +79,13 @@
 
 	might_sleep();
 
-	get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL);
+	/*
+	 * This work is run asynchromously to the task which owns
+	 * mm and might be done in another context, so we must
+	 * use FOLL_REMOTE.
+	 */
+	__get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL, FOLL_REMOTE);
+
 	kvm_async_page_present_sync(vcpu, apf);
 
 	spin_lock(&vcpu->async_pf.lock);
virt/kvm/kvm_main.c | +5 -5
@@ -1260,15 +1260,16 @@
 	return gfn_to_hva_memslot_prot(slot, gfn, writable);
 }
 
-static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, int write, struct page **page)
+static int get_user_page_nowait(unsigned long start, int write,
+		struct page **page)
 {
 	int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
 
 	if (write)
 		flags |= FOLL_WRITE;
 
-	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
+	return __get_user_pages(current, current->mm, start, 1, flags, page,
+				NULL, NULL);
 }
 
 static inline int check_user_page_hwpoison(unsigned long addr)
@@ -1331,8 +1330,7 @@
 
 	if (async) {
 		down_read(&current->mm->mmap_sem);
-		npages = get_user_page_nowait(current, current->mm,
-					      addr, write_fault, page);
+		npages = get_user_page_nowait(addr, write_fault, page);
 		up_read(&current->mm->mmap_sem);
 	} else
 		npages = __get_user_pages_unlocked(current, current->mm, addr, 1,