Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/pkeys: Allocation/free syscalls

This patch adds two new system calls:

int pkey_alloc(unsigned long flags, unsigned long init_access_rights);
int pkey_free(int pkey);

These implement an "allocator" for the protection keys
themselves, which can be thought of as analogous to the allocator
that the kernel has for file descriptors. The kernel tracks
which numbers are in use, and only allows operations on keys that
are valid. A key which was not obtained by pkey_alloc() may not,
for instance, be passed to pkey_mprotect().

These system calls are also very important given the kernel's use
of pkeys to implement execute-only support. These help ensure
that userspace can never assume that it has control of a key
unless it first asks the kernel. The kernel does not promise to
preserve PKRU (rights register) contents except for allocated
pkeys.

The 'init_access_rights' argument to pkey_alloc() specifies the
rights that will be established for the returned pkey. For
instance:

pkey = pkey_alloc(flags, PKEY_DENY_WRITE);

will allocate 'pkey', but also sets the bits in PKRU[1] such that
writing to 'pkey' is already denied.

The kernel does not prevent pkey_free() from successfully freeing
in-use pkeys (those still assigned to a memory range by
pkey_mprotect()). It would be expensive to implement the checks
for this, so we instead say, "Just don't do it" since sane
software will never do it anyway.

Any piece of userspace calling pkey_alloc() needs to be prepared
for it to fail. Why? pkey_alloc() returns the same error code
(ENOSPC) when there are no pkeys and when pkeys are unsupported.
They can be unsupported for a whole host of reasons, so apps must
be prepared for this. Also, libraries or LD_PRELOADs might steal
keys before an application gets access to them.

This allocation mechanism could be implemented in userspace.
Even if we did it in userspace, we would still need additional
user/kernel interfaces to tell userspace which keys are being
used by the kernel internally (such as for execute-only
mappings). Having the kernel provide this facility completely
removes the need for these additional interfaces, or having an
implementation of this in userspace at all.

Note that we have to make changes to all of the architectures
that do not use mman-common.h because we use the new
PKEY_DENY_ACCESS/WRITE macros in arch-independent code.

1. PKRU is the Protection Key Rights User register. It is a
usermode-accessible register that controls whether writes
and/or access to each individual pkey is allowed or denied.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: linux-arch@vger.kernel.org
Cc: Dave Hansen <dave@sr71.net>
Cc: arnd@arndb.de
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: luto@kernel.org
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20160729163015.444FE75F@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

authored by

Dave Hansen and committed by
Thomas Gleixner
e8c24d3a a8502b67

+221 -27
+5
arch/alpha/include/uapi/asm/mman.h
··· 78 78 #define MAP_HUGE_SHIFT 26 79 79 #define MAP_HUGE_MASK 0x3f 80 80 81 + #define PKEY_DISABLE_ACCESS 0x1 82 + #define PKEY_DISABLE_WRITE 0x2 83 + #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ 84 + PKEY_DISABLE_WRITE) 85 + 81 86 #endif /* __ALPHA_MMAN_H__ */
+5
arch/mips/include/uapi/asm/mman.h
··· 105 105 #define MAP_HUGE_SHIFT 26 106 106 #define MAP_HUGE_MASK 0x3f 107 107 108 + #define PKEY_DISABLE_ACCESS 0x1 109 + #define PKEY_DISABLE_WRITE 0x2 110 + #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ 111 + PKEY_DISABLE_WRITE) 112 + 108 113 #endif /* _ASM_MMAN_H */
+5
arch/parisc/include/uapi/asm/mman.h
··· 75 75 #define MAP_HUGE_SHIFT 26 76 76 #define MAP_HUGE_MASK 0x3f 77 77 78 + #define PKEY_DISABLE_ACCESS 0x1 79 + #define PKEY_DISABLE_WRITE 0x2 80 + #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ 81 + PKEY_DISABLE_WRITE) 82 + 78 83 #endif /* __PARISC_MMAN_H__ */
+8
arch/x86/include/asm/mmu.h
··· 23 23 const struct vdso_image *vdso_image; /* vdso image in use */ 24 24 25 25 atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ 26 + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 27 + /* 28 + * One bit per protection key says whether userspace can 29 + * use it or not. protected by mmap_sem. 30 + */ 31 + u16 pkey_allocation_map; 32 + s16 execute_only_pkey; 33 + #endif 26 34 } mm_context_t; 27 35 28 36 #ifdef CONFIG_SMP
+9 -1
arch/x86/include/asm/mmu_context.h
··· 108 108 static inline int init_new_context(struct task_struct *tsk, 109 109 struct mm_struct *mm) 110 110 { 111 + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 112 + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { 113 + /* pkey 0 is the default and always allocated */ 114 + mm->context.pkey_allocation_map = 0x1; 115 + /* -1 means unallocated or invalid */ 116 + mm->context.execute_only_pkey = -1; 117 + } 118 + #endif 111 119 init_new_context_ldt(tsk, mm); 120 + 112 121 return 0; 113 122 } 114 123 static inline void destroy_context(struct mm_struct *mm) ··· 272 263 { 273 264 return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write); 274 265 } 275 - 276 266 #endif /* _ASM_X86_MMU_CONTEXT_H */
+67 -6
arch/x86/include/asm/pkeys.h
··· 1 1 #ifndef _ASM_X86_PKEYS_H 2 2 #define _ASM_X86_PKEYS_H 3 3 4 - #define PKEY_DEDICATED_EXECUTE_ONLY 15 5 - /* 6 - * Consider the PKEY_DEDICATED_EXECUTE_ONLY key unavailable. 7 - */ 8 - #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? \ 9 - PKEY_DEDICATED_EXECUTE_ONLY : 1) 4 + #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) 10 5 11 6 extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 12 7 unsigned long init_val); ··· 34 39 unsigned long init_val); 35 40 36 41 #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) 42 + 43 + #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) 44 + #define mm_set_pkey_allocated(mm, pkey) do { \ 45 + mm_pkey_allocation_map(mm) |= (1U << pkey); \ 46 + } while (0) 47 + #define mm_set_pkey_free(mm, pkey) do { \ 48 + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ 49 + } while (0) 50 + 51 + static inline 52 + bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) 53 + { 54 + return mm_pkey_allocation_map(mm) & (1U << pkey); 55 + } 56 + 57 + /* 58 + * Returns a positive, 4-bit key on success, or -1 on failure. 59 + */ 60 + static inline 61 + int mm_pkey_alloc(struct mm_struct *mm) 62 + { 63 + /* 64 + * Note: this is the one and only place we make sure 65 + * that the pkey is valid as far as the hardware is 66 + * concerned. The rest of the kernel trusts that 67 + * only good, valid pkeys come out of here. 68 + */ 69 + u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); 70 + int ret; 71 + 72 + /* 73 + * Are we out of pkeys? We must handle this specially 74 + * because ffz() behavior is undefined if there are no 75 + * zeros. 
76 + */ 77 + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) 78 + return -1; 79 + 80 + ret = ffz(mm_pkey_allocation_map(mm)); 81 + 82 + mm_set_pkey_allocated(mm, ret); 83 + 84 + return ret; 85 + } 86 + 87 + static inline 88 + int mm_pkey_free(struct mm_struct *mm, int pkey) 89 + { 90 + /* 91 + * pkey 0 is special, always allocated and can never 92 + * be freed. 93 + */ 94 + if (!pkey) 95 + return -EINVAL; 96 + if (!mm_pkey_is_allocated(mm, pkey)) 97 + return -EINVAL; 98 + 99 + mm_set_pkey_free(mm, pkey); 100 + 101 + return 0; 102 + } 103 + 104 + extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 105 + unsigned long init_val); 106 + extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 107 + unsigned long init_val); 37 108 38 109 #endif /*_ASM_X86_PKEYS_H */
+4 -1
arch/x86/kernel/fpu/xstate.c
··· 5 5 */ 6 6 #include <linux/compat.h> 7 7 #include <linux/cpu.h> 8 + #include <linux/mman.h> 8 9 #include <linux/pkeys.h> 9 10 10 11 #include <asm/fpu/api.h> ··· 867 866 return get_xsave_addr(&fpu->state.xsave, xsave_state); 868 867 } 869 868 869 + #ifdef CONFIG_ARCH_HAS_PKEYS 870 + 870 871 #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) 871 872 #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) 872 - 873 873 /* 874 874 * This will go out and modify PKRU register to set the access 875 875 * rights for @pkey to @init_val. ··· 916 914 917 915 return 0; 918 916 } 917 + #endif /* ! CONFIG_ARCH_HAS_PKEYS */ 919 918 920 919 /* 921 920 * This is similar to user_regset_copyout(), but will not add offset to
+30 -8
arch/x86/mm/pkeys.c
··· 21 21 22 22 int __execute_only_pkey(struct mm_struct *mm) 23 23 { 24 + bool need_to_set_mm_pkey = false; 25 + int execute_only_pkey = mm->context.execute_only_pkey; 24 26 int ret; 27 + 28 + /* Do we need to assign a pkey for mm's execute-only maps? */ 29 + if (execute_only_pkey == -1) { 30 + /* Go allocate one to use, which might fail */ 31 + execute_only_pkey = mm_pkey_alloc(mm); 32 + if (execute_only_pkey < 0) 33 + return -1; 34 + need_to_set_mm_pkey = true; 35 + } 25 36 26 37 /* 27 38 * We do not want to go through the relatively costly ··· 43 32 * can make fpregs inactive. 44 33 */ 45 34 preempt_disable(); 46 - if (fpregs_active() && 47 - !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) { 35 + if (!need_to_set_mm_pkey && 36 + fpregs_active() && 37 + !__pkru_allows_read(read_pkru(), execute_only_pkey)) { 48 38 preempt_enable(); 49 - return PKEY_DEDICATED_EXECUTE_ONLY; 39 + return execute_only_pkey; 50 40 } 51 41 preempt_enable(); 52 - ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY, 42 + 43 + /* 44 + * Set up PKRU so that it denies access for everything 45 + * other than execution. 46 + */ 47 + ret = arch_set_user_pkey_access(current, execute_only_pkey, 53 48 PKEY_DISABLE_ACCESS); 54 49 /* 55 50 * If the PKRU-set operation failed somehow, just return 56 51 * 0 and effectively disable execute-only support. 
57 52 */ 58 - if (ret) 59 - return 0; 53 + if (ret) { 54 + mm_set_pkey_free(mm, execute_only_pkey); 55 + return -1; 56 + } 60 57 61 - return PKEY_DEDICATED_EXECUTE_ONLY; 58 + /* We got one, store it and use it from here on out */ 59 + if (need_to_set_mm_pkey) 60 + mm->context.execute_only_pkey = execute_only_pkey; 61 + return execute_only_pkey; 62 62 } 63 63 64 64 static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) ··· 77 55 /* Do this check first since the vm_flags should be hot */ 78 56 if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) 79 57 return false; 80 - if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY) 58 + if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey) 81 59 return false; 82 60 83 61 return true;
+5
arch/xtensa/include/uapi/asm/mman.h
··· 117 117 #define MAP_HUGE_SHIFT 26 118 118 #define MAP_HUGE_MASK 0x3f 119 119 120 + #define PKEY_DISABLE_ACCESS 0x1 121 + #define PKEY_DISABLE_WRITE 0x2 122 + #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ 123 + PKEY_DISABLE_WRITE) 124 + 120 125 #endif /* _XTENSA_MMAN_H */
+23 -5
include/linux/pkeys.h
··· 4 4 #include <linux/mm_types.h> 5 5 #include <asm/mmu_context.h> 6 6 7 - #define PKEY_DISABLE_ACCESS 0x1 8 - #define PKEY_DISABLE_WRITE 0x2 9 - #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ 10 - PKEY_DISABLE_WRITE) 11 - 12 7 #ifdef CONFIG_ARCH_HAS_PKEYS 13 8 #include <asm/pkeys.h> 14 9 #else /* ! CONFIG_ARCH_HAS_PKEYS */ ··· 12 17 #define arch_override_mprotect_pkey(vma, prot, pkey) (0) 13 18 #define PKEY_DEDICATED_EXECUTE_ONLY 0 14 19 #define ARCH_VM_PKEY_FLAGS 0 20 + 21 + static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) 22 + { 23 + return (pkey == 0); 24 + } 25 + 26 + static inline int mm_pkey_alloc(struct mm_struct *mm) 27 + { 28 + return -1; 29 + } 30 + 31 + static inline int mm_pkey_free(struct mm_struct *mm, int pkey) 32 + { 33 + WARN_ONCE(1, "free of protection key when disabled"); 34 + return -EINVAL; 35 + } 36 + 37 + static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 38 + unsigned long init_val) 39 + { 40 + return 0; 41 + } 42 + 15 43 #endif /* ! CONFIG_ARCH_HAS_PKEYS */ 16 44 17 45 #endif /* _LINUX_PKEYS_H */
+5
include/uapi/asm-generic/mman-common.h
··· 72 72 #define MAP_HUGE_SHIFT 26 73 73 #define MAP_HUGE_MASK 0x3f 74 74 75 + #define PKEY_DISABLE_ACCESS 0x1 76 + #define PKEY_DISABLE_WRITE 0x2 77 + #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ 78 + PKEY_DISABLE_WRITE) 79 + 75 80 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
+55 -6
mm/mprotect.c
··· 23 23 #include <linux/mmu_notifier.h> 24 24 #include <linux/migrate.h> 25 25 #include <linux/perf_event.h> 26 + #include <linux/pkeys.h> 26 27 #include <linux/ksm.h> 27 28 #include <linux/pkeys.h> 28 29 #include <asm/uaccess.h> 29 30 #include <asm/pgtable.h> 30 31 #include <asm/cacheflush.h> 32 + #include <asm/mmu_context.h> 31 33 #include <asm/tlbflush.h> 32 34 33 35 #include "internal.h" ··· 366 364 const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); 367 365 const bool rier = (current->personality & READ_IMPLIES_EXEC) && 368 366 (prot & PROT_READ); 369 - /* 370 - * A temporary safety check since we are not validating 371 - * the pkey before we introduce the allocation code. 372 - */ 373 - if (pkey != -1) 374 - return -EINVAL; 375 367 376 368 prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); 377 369 if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ ··· 386 390 387 391 if (down_write_killable(&current->mm->mmap_sem)) 388 392 return -EINTR; 393 + 394 + /* 395 + * If userspace did not allocate the pkey, do not let 396 + * them use it here. 397 + */ 398 + error = -EINVAL; 399 + if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) 400 + goto out; 389 401 390 402 vma = find_vma(current->mm, start); 391 403 error = -ENOMEM; ··· 488 484 unsigned long, prot, int, pkey) 489 485 { 490 486 return do_mprotect_pkey(start, len, prot, pkey); 487 + } 488 + 489 + SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) 490 + { 491 + int pkey; 492 + int ret; 493 + 494 + /* No flags supported yet. 
*/ 495 + if (flags) 496 + return -EINVAL; 497 + /* check for unsupported init values */ 498 + if (init_val & ~PKEY_ACCESS_MASK) 499 + return -EINVAL; 500 + 501 + down_write(&current->mm->mmap_sem); 502 + pkey = mm_pkey_alloc(current->mm); 503 + 504 + ret = -ENOSPC; 505 + if (pkey == -1) 506 + goto out; 507 + 508 + ret = arch_set_user_pkey_access(current, pkey, init_val); 509 + if (ret) { 510 + mm_pkey_free(current->mm, pkey); 511 + goto out; 512 + } 513 + ret = pkey; 514 + out: 515 + up_write(&current->mm->mmap_sem); 516 + return ret; 517 + } 518 + 519 + SYSCALL_DEFINE1(pkey_free, int, pkey) 520 + { 521 + int ret; 522 + 523 + down_write(&current->mm->mmap_sem); 524 + ret = mm_pkey_free(current->mm, pkey); 525 + up_write(&current->mm->mmap_sem); 526 + 527 + /* 528 + * We could provie warnings or errors if any VMA still 529 + * has the pkey set here. 530 + */ 531 + return ret; 491 532 }