Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/pkeys: Add self-tests

This code should be a good demonstration of how to use the new
system calls as well as how to use protection keys in general.

This code shows how to:
1. Manipulate the Protection Keys Rights User (PKRU) register
2. Set a protection key on memory
3. Fetch and/or modify PKRU from the signal XSAVE state
4. Read the kernel-provided protection key in the siginfo
5. Set up an execute-only mapping

There are currently 13 tests:

test_read_of_write_disabled_region
test_read_of_access_disabled_region
test_write_of_write_disabled_region
test_write_of_access_disabled_region
test_kernel_write_of_access_disabled_region
test_kernel_write_of_write_disabled_region
test_kernel_gup_of_access_disabled_region
test_kernel_gup_write_to_write_disabled_region
test_executing_on_unreadable_memory
test_ptrace_of_child
test_pkey_syscalls_on_non_allocated_pkey
test_pkey_syscalls_bad_args
test_pkey_alloc_exhaust

Each of the tests is run with plain memory (via mmap(MAP_ANON)),
transparent huge pages, and hugetlb.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-arch@vger.kernel.org
Cc: Dave Hansen <dave@sr71.net>
Cc: mgorman@techsingularity.net
Cc: arnd@arndb.de
Cc: linux-api@vger.kernel.org
Cc: shuahkh@osg.samsung.com
Cc: linux-mm@kvack.org
Cc: luto@kernel.org
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20160729163024.FC5A0C2D@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

authored by

Dave Hansen and committed by
Thomas Gleixner
5f23f6d0 76de9937

+1631 -1
+2 -1
tools/testing/selftests/x86/Makefile
··· 5 5 .PHONY: all all_32 all_64 warn_32bit_failure clean 6 6 7 7 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ 8 - check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test 8 + check_initial_reg_state sigreturn ldt_gdt iopl \ 9 + protection_keys 9 10 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ 10 11 test_FCMOV test_FCOMI test_FISTTP \ 11 12 vdso_restorer
+219
tools/testing/selftests/x86/pkey-helpers.h
#ifndef _PKEYS_HELPER_H
#define _PKEYS_HELPER_H
#define _GNU_SOURCE
#include <string.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <signal.h>
#include <assert.h>
#include <stdlib.h>
#include <ucontext.h>
#include <unistd.h>
#include <sys/mman.h>

#define NR_PKEYS 16
#define PKRU_BITS_PER_PKEY 2

#ifndef DEBUG_LEVEL
#define DEBUG_LEVEL 0
#endif
#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
extern int dprint_in_signal;
extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];

/*
 * printf() replacement used by the dprintf*() macros.
 *
 * Outside of a signal handler (dprint_in_signal == 0) this is plain
 * vprintf().  When dprint_in_signal is set, the message is formatted
 * into dprint_in_signal_buffer and pushed out with write(2) on fd 1
 * instead of going through stdio.
 */
static inline void sigsafe_printf(const char *format, ...)
{
	va_list ap;

	va_start(ap, format);
	if (!dprint_in_signal) {
		vprintf(format, ap);
	} else {
		int len = vsnprintf(dprint_in_signal_buffer,
				    DPRINT_IN_SIGNAL_BUF_SIZE,
				    format, ap);
		/*
		 * len is the amount that would have been printed, but
		 * vsnprintf() stores at most BUF_SIZE - 1 characters
		 * plus the terminating NUL.  Do not write the NUL out.
		 */
		if (len >= DPRINT_IN_SIGNAL_BUF_SIZE)
			len = DPRINT_IN_SIGNAL_BUF_SIZE - 1;
		/* a negative len is a formatting error: emit nothing */
		if (len > 0) {
			ssize_t written = write(1, dprint_in_signal_buffer, len);

			/* best-effort debug output, nothing to do on failure */
			(void)written;
		}
	}
	va_end(ap);
}
#define dprintf_level(level, args...) do {	\
	if (level <= DEBUG_LEVEL)		\
		sigsafe_printf(args);		\
	fflush(NULL);				\
} while (0)
#define dprintf0(args...) dprintf_level(0, args)
#define dprintf1(args...) dprintf_level(1, args)
#define dprintf2(args...) dprintf_level(2, args)
#define dprintf3(args...) dprintf_level(3, args)
#define dprintf4(args...) dprintf_level(4, args)

extern unsigned int shadow_pkru;

/* Read PKRU with the raw RDPKRU opcode; no shadow_pkru checking. */
static inline unsigned int __rdpkru(void)
{
	unsigned int eax, edx;
	unsigned int ecx = 0;

	/* RDPKRU: ecx must be 0, result comes back in eax (edx is clobbered) */
	__asm__ volatile(".byte 0x0f,0x01,0xee\n\t"
			 : "=a" (eax), "=d" (edx)
			 : "c" (ecx));
	return eax;
}

/* Read PKRU and assert that it still matches shadow_pkru. */
static inline unsigned int _rdpkru(int line)
{
	unsigned int pkru = __rdpkru();

	dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
			line, pkru, shadow_pkru);
	assert(pkru == shadow_pkru);

	return pkru;
}

#define rdpkru() _rdpkru(__LINE__)

/* Write PKRU with the raw WRPKRU opcode; shadow_pkru is NOT updated. */
static inline void __wrpkru(unsigned int pkru)
{
	unsigned int eax = pkru;
	unsigned int ecx = 0;
	unsigned int edx = 0;

	dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
	/* WRPKRU: value in eax, ecx and edx must both be 0 */
	__asm__ volatile(".byte 0x0f,0x01,0xef\n\t"
			 : : "a" (eax), "c" (ecx), "d" (edx));
	assert(pkru == __rdpkru());
}

/* Write PKRU and keep shadow_pkru in sync with it. */
static inline void wrpkru(unsigned int pkru)
{
	dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
	/* will do the shadow check for us: */
	rdpkru();
	__wrpkru(pkru);
	shadow_pkru = pkru;
	dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
}

/*
 * Return 'pkru' with one disable bit changed: the bit is cleared when
 * 'do_allow' is non-zero (permission granted) and set otherwise
 * (permission denied).  All other bits are left untouched.
 */
static inline unsigned int pkru_apply_disable_bit(unsigned int pkru, int bit,
						  int do_allow)
{
	if (do_allow)
		return pkru & ~(1u << bit);
	return pkru | (1u << bit);
}

/*
 * These are technically racy. since something could
 * change PKRU between the read and the write.
 */
static inline void __pkey_access_allow(int pkey, int do_allow)
{
	unsigned int pkru = rdpkru();
	/* the access-disable bit is the low bit of the key's pair */
	int bit = pkey * PKRU_BITS_PER_PKEY;

	pkru = pkru_apply_disable_bit(pkru, bit, do_allow);

	wrpkru(pkru);
	dprintf4("pkru now: %08x\n", rdpkru());
}

static inline void __pkey_write_allow(int pkey, int do_allow_write)
{
	unsigned int pkru = rdpkru();
	/* the write-disable bit is the high bit of the key's pair */
	int bit = pkey * PKRU_BITS_PER_PKEY + 1;

	pkru = pkru_apply_disable_bit(pkru, bit, do_allow_write);

	wrpkru(pkru);
	dprintf4("pkru now: %08x\n", rdpkru());
}

#define PROT_PKEY0     0x10            /* protection key value (bit 0) */
#define PROT_PKEY1     0x20            /* protection key value (bit 1) */
#define PROT_PKEY2     0x40            /* protection key value (bit 2) */
#define PROT_PKEY3     0x80            /* protection key value (bit 3) */

#define PAGE_SIZE 4096
#define MB	(1<<20)

/*
 * Execute CPUID.  *eax selects the leaf and *ecx the sub-leaf on
 * entry; all four registers are written back on return.
 */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
		unsigned int *ecx, unsigned int *edx)
{
	/* ecx is often an input as well as an output. */
	__asm__ volatile(
		"cpuid;"
		: "=a" (*eax),
		  "=b" (*ebx),
		  "=c" (*ecx),
		  "=d" (*edx)
		: "0" (*eax), "2" (*ecx));
}

/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
#define X86_FEATURE_PKU        (1<<3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE      (1<<4) /* OS Protection Keys Enable */

/* Returns 1 when CPUID reports both PKU and OSPKE, 0 otherwise. */
static inline int cpu_has_pku(void)
{
	unsigned int eax;
	unsigned int ebx;
	unsigned int ecx;
	unsigned int edx;

	eax = 0x7;
	ecx = 0x0;
	__cpuid(&eax, &ebx, &ecx, &edx);

	if (!(ecx & X86_FEATURE_PKU)) {
		dprintf2("cpu does not have PKU\n");
		return 0;
	}
	if (!(ecx & X86_FEATURE_OSPKE)) {
		dprintf2("cpu does not have OSPKE\n");
		return 0;
	}
	return 1;
}

#define XSTATE_PKRU_BIT	(9)
#define XSTATE_PKRU	0x200

/*
 * Returns the byte offset of the PKRU component inside the XSAVE
 * area, as reported by CPUID leaf 0xd, or 0 (after printing a
 * message) when CPUID reports a zero size for that component.
 *
 * 'static inline' so that including this header from more than one
 * translation unit does not produce duplicate definitions.
 */
static inline int pkru_xstate_offset(void)
{
	unsigned int eax;
	unsigned int ebx;
	unsigned int ecx;
	unsigned int edx;
	int xstate_offset;
	int xstate_size;
	unsigned long XSTATE_CPUID = 0xd;

	/* assume that XSTATE_PKRU is set in XCR0 */
	eax = XSTATE_CPUID;
	ecx = XSTATE_PKRU_BIT;
	__cpuid(&eax, &ebx, &ecx, &edx);

	/* for sub-leaves >= 2: eax is the size, ebx the offset */
	xstate_offset = ebx;
	xstate_size = eax;

	if (xstate_size == 0) {
		printf("could not find size/offset of PKRU in xsave state\n");
		return 0;
	}

	return xstate_offset;
}

#endif /* _PKEYS_HELPER_H */
+1410
tools/testing/selftests/x86/protection_keys.c
··· 1 + /* 2 + * Tests x86 Memory Protection Keys (see Documentation/x86/protection-keys.txt) 3 + * 4 + * There are examples in here of: 5 + * * how to set protection keys on memory 6 + * * how to set/clear bits in PKRU (the rights register) 7 + * * how to handle SEGV_PKRU signals and extract pkey-relevant 8 + * information from the siginfo 9 + * 10 + * Things to add: 11 + * make sure KSM and KSM COW breaking works 12 + * prefault pages in at malloc, or not 13 + * protect MPX bounds tables with protection keys? 14 + * make sure VMA splitting/merging is working correctly 15 + * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys 16 + * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel 17 + * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks 18 + * 19 + * Compile like this: 20 + * gcc -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 21 + * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 22 + */ 23 + #define _GNU_SOURCE 24 + #include <errno.h> 25 + #include <linux/futex.h> 26 + #include <sys/time.h> 27 + #include <sys/syscall.h> 28 + #include <string.h> 29 + #include <stdio.h> 30 + #include <stdint.h> 31 + #include <stdbool.h> 32 + #include <signal.h> 33 + #include <assert.h> 34 + #include <stdlib.h> 35 + #include <ucontext.h> 36 + #include <sys/mman.h> 37 + #include <sys/types.h> 38 + #include <sys/wait.h> 39 + #include <sys/stat.h> 40 + #include <fcntl.h> 41 + #include <unistd.h> 42 + #include <sys/ptrace.h> 43 + #include <setjmp.h> 44 + 45 + #include "pkey-helpers.h" 46 + 47 + int iteration_nr = 1; 48 + int test_nr; 49 + 50 + unsigned int shadow_pkru; 51 + 52 + #define HPAGE_SIZE (1UL<<21) 53 + #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) 54 + #define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) 55 + #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) 56 + #define 
ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) 57 + #define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) 58 + #define __stringify_1(x...) #x 59 + #define __stringify(x...) __stringify_1(x) 60 + 61 + #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) 62 + 63 + int dprint_in_signal; 64 + char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; 65 + 66 + extern void abort_hooks(void); 67 + #define pkey_assert(condition) do { \ 68 + if (!(condition)) { \ 69 + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ 70 + __FILE__, __LINE__, \ 71 + test_nr, iteration_nr); \ 72 + dprintf0("errno at assert: %d", errno); \ 73 + abort_hooks(); \ 74 + assert(condition); \ 75 + } \ 76 + } while (0) 77 + #define raw_assert(cond) assert(cond) 78 + 79 + void cat_into_file(char *str, char *file) 80 + { 81 + int fd = open(file, O_RDWR); 82 + int ret; 83 + 84 + dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); 85 + /* 86 + * these need to be raw because they are called under 87 + * pkey_assert() 88 + */ 89 + raw_assert(fd >= 0); 90 + ret = write(fd, str, strlen(str)); 91 + if (ret != strlen(str)) { 92 + perror("write to file failed"); 93 + fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); 94 + raw_assert(0); 95 + } 96 + close(fd); 97 + } 98 + 99 + #if CONTROL_TRACING > 0 100 + static int warned_tracing; 101 + int tracing_root_ok(void) 102 + { 103 + if (geteuid() != 0) { 104 + if (!warned_tracing) 105 + fprintf(stderr, "WARNING: not run as root, " 106 + "can not do tracing control\n"); 107 + warned_tracing = 1; 108 + return 0; 109 + } 110 + return 1; 111 + } 112 + #endif 113 + 114 + void tracing_on(void) 115 + { 116 + #if CONTROL_TRACING > 0 117 + #define TRACEDIR "/sys/kernel/debug/tracing" 118 + char pidstr[32]; 119 + 120 + if (!tracing_root_ok()) 121 + return; 122 + 123 + sprintf(pidstr, "%d", getpid()); 124 + cat_into_file("0", TRACEDIR "/tracing_on"); 125 + cat_into_file("\n", 
TRACEDIR "/trace"); 126 + if (1) { 127 + cat_into_file("function_graph", TRACEDIR "/current_tracer"); 128 + cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); 129 + } else { 130 + cat_into_file("nop", TRACEDIR "/current_tracer"); 131 + } 132 + cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); 133 + cat_into_file("1", TRACEDIR "/tracing_on"); 134 + dprintf1("enabled tracing\n"); 135 + #endif 136 + } 137 + 138 + void tracing_off(void) 139 + { 140 + #if CONTROL_TRACING > 0 141 + if (!tracing_root_ok()) 142 + return; 143 + cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); 144 + #endif 145 + } 146 + 147 + void abort_hooks(void) 148 + { 149 + fprintf(stderr, "running %s()...\n", __func__); 150 + tracing_off(); 151 + #ifdef SLEEP_ON_ABORT 152 + sleep(SLEEP_ON_ABORT); 153 + #endif 154 + } 155 + 156 + static inline void __page_o_noops(void) 157 + { 158 + /* 8-bytes of instruction * 512 bytes = 1 page */ 159 + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); 160 + } 161 + 162 + /* 163 + * This attempts to have roughly a page of instructions followed by a few 164 + * instructions that do a write, and another page of instructions. That 165 + * way, we are pretty sure that the write is in the second page of 166 + * instructions and has at least a page of padding behind it. 167 + * 168 + * *That* lets us be sure to madvise() away the write instruction, which 169 + * will then fault, which makes sure that the fault code handles 170 + * execute-only memory properly. 
171 + */ 172 + __attribute__((__aligned__(PAGE_SIZE))) 173 + void lots_o_noops_around_write(int *write_to_me) 174 + { 175 + dprintf3("running %s()\n", __func__); 176 + __page_o_noops(); 177 + /* Assume this happens in the second page of instructions: */ 178 + *write_to_me = __LINE__; 179 + /* pad out by another page: */ 180 + __page_o_noops(); 181 + dprintf3("%s() done\n", __func__); 182 + } 183 + 184 + /* Define some kernel-like types */ 185 + #define u8 uint8_t 186 + #define u16 uint16_t 187 + #define u32 uint32_t 188 + #define u64 uint64_t 189 + 190 + #ifdef __i386__ 191 + #define SYS_mprotect_key 380 192 + #define SYS_pkey_alloc 381 193 + #define SYS_pkey_free 382 194 + #define REG_IP_IDX REG_EIP 195 + #define si_pkey_offset 0x18 196 + #else 197 + #define SYS_mprotect_key 329 198 + #define SYS_pkey_alloc 330 199 + #define SYS_pkey_free 331 200 + #define REG_IP_IDX REG_RIP 201 + #define si_pkey_offset 0x20 202 + #endif 203 + 204 + void dump_mem(void *dumpme, int len_bytes) 205 + { 206 + char *c = (void *)dumpme; 207 + int i; 208 + 209 + for (i = 0; i < len_bytes; i += sizeof(u64)) { 210 + u64 *ptr = (u64 *)(c + i); 211 + dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr); 212 + } 213 + } 214 + 215 + #define __SI_FAULT (3 << 16) 216 + #define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ 217 + #define SEGV_PKUERR (__SI_FAULT|4) 218 + 219 + static char *si_code_str(int si_code) 220 + { 221 + if (si_code & SEGV_MAPERR) 222 + return "SEGV_MAPERR"; 223 + if (si_code & SEGV_ACCERR) 224 + return "SEGV_ACCERR"; 225 + if (si_code & SEGV_BNDERR) 226 + return "SEGV_BNDERR"; 227 + if (si_code & SEGV_PKUERR) 228 + return "SEGV_PKUERR"; 229 + return "UNKNOWN"; 230 + } 231 + 232 + int pkru_faults; 233 + int last_si_pkey = -1; 234 + void signal_handler(int signum, siginfo_t *si, void *vucontext) 235 + { 236 + ucontext_t *uctxt = vucontext; 237 + int trapno; 238 + unsigned long ip; 239 + char *fpregs; 240 + u32 *pkru_ptr; 241 + u64 si_pkey; 242 + u32 
*si_pkey_ptr; 243 + int pkru_offset; 244 + fpregset_t fpregset; 245 + 246 + dprint_in_signal = 1; 247 + dprintf1(">>>>===============SIGSEGV============================\n"); 248 + dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, 249 + __rdpkru(), shadow_pkru); 250 + 251 + trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; 252 + ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; 253 + fpregset = uctxt->uc_mcontext.fpregs; 254 + fpregs = (void *)fpregset; 255 + 256 + dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, 257 + trapno, ip, si_code_str(si->si_code), si->si_code); 258 + #ifdef __i386__ 259 + /* 260 + * 32-bit has some extra padding so that userspace can tell whether 261 + * the XSTATE header is present in addition to the "legacy" FPU 262 + * state. We just assume that it is here. 263 + */ 264 + fpregs += 0x70; 265 + #endif 266 + pkru_offset = pkru_xstate_offset(); 267 + pkru_ptr = (void *)(&fpregs[pkru_offset]); 268 + 269 + dprintf1("siginfo: %p\n", si); 270 + dprintf1(" fpregs: %p\n", fpregs); 271 + /* 272 + * If we got a PKRU fault, we *HAVE* to have at least one bit set in 273 + * here. 
274 + */ 275 + dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); 276 + if (DEBUG_LEVEL > 4) 277 + dump_mem(pkru_ptr - 128, 256); 278 + pkey_assert(*pkru_ptr); 279 + 280 + si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); 281 + dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); 282 + dump_mem(si_pkey_ptr - 8, 24); 283 + si_pkey = *si_pkey_ptr; 284 + pkey_assert(si_pkey < NR_PKEYS); 285 + last_si_pkey = si_pkey; 286 + 287 + if ((si->si_code == SEGV_MAPERR) || 288 + (si->si_code == SEGV_ACCERR) || 289 + (si->si_code == SEGV_BNDERR)) { 290 + printf("non-PK si_code, exiting...\n"); 291 + exit(4); 292 + } 293 + 294 + dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); 295 + /* need __rdpkru() version so we do not do shadow_pkru checking */ 296 + dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); 297 + dprintf1("si_pkey from siginfo: %jx\n", si_pkey); 298 + *(u64 *)pkru_ptr = 0x00000000; 299 + dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); 300 + pkru_faults++; 301 + dprintf1("<<<<==================================================\n"); 302 + return; 303 + if (trapno == 14) { 304 + fprintf(stderr, 305 + "ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n", 306 + trapno, ip); 307 + fprintf(stderr, "si_addr %p\n", si->si_addr); 308 + fprintf(stderr, "REG_ERR: %lx\n", 309 + (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]); 310 + exit(1); 311 + } else { 312 + fprintf(stderr, "unexpected trap %d! 
at 0x%lx\n", trapno, ip); 313 + fprintf(stderr, "si_addr %p\n", si->si_addr); 314 + fprintf(stderr, "REG_ERR: %lx\n", 315 + (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]); 316 + exit(2); 317 + } 318 + dprint_in_signal = 0; 319 + } 320 + 321 + int wait_all_children(void) 322 + { 323 + int status; 324 + return waitpid(-1, &status, 0); 325 + } 326 + 327 + void sig_chld(int x) 328 + { 329 + dprint_in_signal = 1; 330 + dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); 331 + dprint_in_signal = 0; 332 + } 333 + 334 + void setup_sigsegv_handler(void) 335 + { 336 + int r, rs; 337 + struct sigaction newact; 338 + struct sigaction oldact; 339 + 340 + /* #PF is mapped to sigsegv */ 341 + int signum = SIGSEGV; 342 + 343 + newact.sa_handler = 0; 344 + newact.sa_sigaction = signal_handler; 345 + 346 + /*sigset_t - signals to block while in the handler */ 347 + /* get the old signal mask. */ 348 + rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); 349 + pkey_assert(rs == 0); 350 + 351 + /* call sa_sigaction, not sa_handler*/ 352 + newact.sa_flags = SA_SIGINFO; 353 + 354 + newact.sa_restorer = 0; /* void(*)(), obsolete */ 355 + r = sigaction(signum, &newact, &oldact); 356 + r = sigaction(SIGALRM, &newact, &oldact); 357 + pkey_assert(r == 0); 358 + } 359 + 360 + void setup_handlers(void) 361 + { 362 + signal(SIGCHLD, &sig_chld); 363 + setup_sigsegv_handler(); 364 + } 365 + 366 + pid_t fork_lazy_child(void) 367 + { 368 + pid_t forkret; 369 + 370 + forkret = fork(); 371 + pkey_assert(forkret >= 0); 372 + dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); 373 + 374 + if (!forkret) { 375 + /* in the child */ 376 + while (1) { 377 + dprintf1("child sleeping...\n"); 378 + sleep(30); 379 + } 380 + } 381 + return forkret; 382 + } 383 + 384 + void davecmp(void *_a, void *_b, int len) 385 + { 386 + int i; 387 + unsigned long *a = _a; 388 + unsigned long *b = _b; 389 + 390 + for (i = 0; i < len / sizeof(*a); i++) { 391 + if (a[i] == b[i]) 392 + continue; 393 + 394 + dprintf3("[%3d]: a: %016lx 
b: %016lx\n", i, a[i], b[i]); 395 + } 396 + } 397 + 398 + void dumpit(char *f) 399 + { 400 + int fd = open(f, O_RDONLY); 401 + char buf[100]; 402 + int nr_read; 403 + 404 + dprintf2("maps fd: %d\n", fd); 405 + do { 406 + nr_read = read(fd, &buf[0], sizeof(buf)); 407 + write(1, buf, nr_read); 408 + } while (nr_read > 0); 409 + close(fd); 410 + } 411 + 412 + #define PKEY_DISABLE_ACCESS 0x1 413 + #define PKEY_DISABLE_WRITE 0x2 414 + 415 + u32 pkey_get(int pkey, unsigned long flags) 416 + { 417 + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); 418 + u32 pkru = __rdpkru(); 419 + u32 shifted_pkru; 420 + u32 masked_pkru; 421 + 422 + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", 423 + __func__, pkey, flags, 0, 0); 424 + dprintf2("%s() raw pkru: %x\n", __func__, pkru); 425 + 426 + shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); 427 + dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); 428 + masked_pkru = shifted_pkru & mask; 429 + dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru); 430 + /* 431 + * shift down the relevant bits to the lowest two, then 432 + * mask off all the other high bits. 
433 + */ 434 + return masked_pkru; 435 + } 436 + 437 + int pkey_set(int pkey, unsigned long rights, unsigned long flags) 438 + { 439 + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); 440 + u32 old_pkru = __rdpkru(); 441 + u32 new_pkru; 442 + 443 + /* make sure that 'rights' only contains the bits we expect: */ 444 + assert(!(rights & ~mask)); 445 + 446 + /* copy old pkru */ 447 + new_pkru = old_pkru; 448 + /* mask out bits from pkey in old value: */ 449 + new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); 450 + /* OR in new bits for pkey: */ 451 + new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); 452 + 453 + __wrpkru(new_pkru); 454 + 455 + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", 456 + __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); 457 + return 0; 458 + } 459 + 460 + void pkey_disable_set(int pkey, int flags) 461 + { 462 + unsigned long syscall_flags = 0; 463 + int ret; 464 + int pkey_rights; 465 + u32 orig_pkru; 466 + 467 + dprintf1("START->%s(%d, 0x%x)\n", __func__, 468 + pkey, flags); 469 + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 470 + 471 + pkey_rights = pkey_get(pkey, syscall_flags); 472 + 473 + dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, 474 + pkey, pkey, pkey_rights); 475 + pkey_assert(pkey_rights >= 0); 476 + 477 + pkey_rights |= flags; 478 + 479 + ret = pkey_set(pkey, pkey_rights, syscall_flags); 480 + assert(!ret); 481 + /*pkru and flags have the same format */ 482 + shadow_pkru |= flags << (pkey * 2); 483 + dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); 484 + 485 + pkey_assert(ret >= 0); 486 + 487 + pkey_rights = pkey_get(pkey, syscall_flags); 488 + dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, 489 + pkey, pkey, pkey_rights); 490 + 491 + dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); 492 + if (flags) 493 + pkey_assert(rdpkru() > orig_pkru); 494 + dprintf1("END<---%s(%d, 0x%x)\n", __func__, 495 + pkey, flags); 496 + } 497 + 498 + void 
pkey_disable_clear(int pkey, int flags) 499 + { 500 + unsigned long syscall_flags = 0; 501 + int ret; 502 + int pkey_rights = pkey_get(pkey, syscall_flags); 503 + u32 orig_pkru = rdpkru(); 504 + 505 + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 506 + 507 + dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, 508 + pkey, pkey, pkey_rights); 509 + pkey_assert(pkey_rights >= 0); 510 + 511 + pkey_rights |= flags; 512 + 513 + ret = pkey_set(pkey, pkey_rights, 0); 514 + /* pkru and flags have the same format */ 515 + shadow_pkru &= ~(flags << (pkey * 2)); 516 + pkey_assert(ret >= 0); 517 + 518 + pkey_rights = pkey_get(pkey, syscall_flags); 519 + dprintf1("%s(%d) pkey_get(%d): %x\n", __func__, 520 + pkey, pkey, pkey_rights); 521 + 522 + dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); 523 + if (flags) 524 + assert(rdpkru() > orig_pkru); 525 + } 526 + 527 + void pkey_write_allow(int pkey) 528 + { 529 + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); 530 + } 531 + void pkey_write_deny(int pkey) 532 + { 533 + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); 534 + } 535 + void pkey_access_allow(int pkey) 536 + { 537 + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); 538 + } 539 + void pkey_access_deny(int pkey) 540 + { 541 + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); 542 + } 543 + 544 + int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, 545 + unsigned long pkey) 546 + { 547 + int sret; 548 + 549 + dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, 550 + ptr, size, orig_prot, pkey); 551 + 552 + errno = 0; 553 + sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); 554 + if (errno) { 555 + dprintf2("SYS_mprotect_key sret: %d\n", sret); 556 + dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); 557 + dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); 558 + if (DEBUG_LEVEL >= 2) 559 + perror("SYS_mprotect_pkey"); 560 + } 561 + return sret; 562 + } 563 + 564 + int sys_pkey_alloc(unsigned long flags, unsigned long 
init_val) 565 + { 566 + int ret = syscall(SYS_pkey_alloc, flags, init_val); 567 + dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", 568 + __func__, flags, init_val, ret, errno); 569 + return ret; 570 + } 571 + 572 + int alloc_pkey(void) 573 + { 574 + int ret; 575 + unsigned long init_val = 0x0; 576 + 577 + dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", 578 + __LINE__, __rdpkru(), shadow_pkru); 579 + ret = sys_pkey_alloc(0, init_val); 580 + /* 581 + * pkey_alloc() sets PKRU, so we need to reflect it in 582 + * shadow_pkru: 583 + */ 584 + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", 585 + __LINE__, ret, __rdpkru(), shadow_pkru); 586 + if (ret) { 587 + /* clear both the bits: */ 588 + shadow_pkru &= ~(0x3 << (ret * 2)); 589 + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", 590 + __LINE__, ret, __rdpkru(), shadow_pkru); 591 + /* 592 + * move the new state in from init_val 593 + * (remember, we cheated and init_val == pkru format) 594 + */ 595 + shadow_pkru |= (init_val << (ret * 2)); 596 + } 597 + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", 598 + __LINE__, ret, __rdpkru(), shadow_pkru); 599 + dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); 600 + /* for shadow checking: */ 601 + rdpkru(); 602 + dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", 603 + __LINE__, ret, __rdpkru(), shadow_pkru); 604 + return ret; 605 + } 606 + 607 + int sys_pkey_free(unsigned long pkey) 608 + { 609 + int ret = syscall(SYS_pkey_free, pkey); 610 + dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); 611 + return ret; 612 + } 613 + 614 + /* 615 + * I had a bug where pkey bits could be set by mprotect() but 616 + * not cleared. This ensures we get lots of random bit sets 617 + * and clears on the vma and pte pkey bits. 
618 + */ 619 + int alloc_random_pkey(void) 620 + { 621 + int max_nr_pkey_allocs; 622 + int ret; 623 + int i; 624 + int alloced_pkeys[NR_PKEYS]; 625 + int nr_alloced = 0; 626 + int random_index; 627 + memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); 628 + 629 + /* allocate every possible key and make a note of which ones we got */ 630 + max_nr_pkey_allocs = NR_PKEYS; 631 + max_nr_pkey_allocs = 1; 632 + for (i = 0; i < max_nr_pkey_allocs; i++) { 633 + int new_pkey = alloc_pkey(); 634 + if (new_pkey < 0) 635 + break; 636 + alloced_pkeys[nr_alloced++] = new_pkey; 637 + } 638 + 639 + pkey_assert(nr_alloced > 0); 640 + /* select a random one out of the allocated ones */ 641 + random_index = rand() % nr_alloced; 642 + ret = alloced_pkeys[random_index]; 643 + /* now zero it out so we don't free it next */ 644 + alloced_pkeys[random_index] = 0; 645 + 646 + /* go through the allocated ones that we did not want and free them */ 647 + for (i = 0; i < nr_alloced; i++) { 648 + int free_ret; 649 + if (!alloced_pkeys[i]) 650 + continue; 651 + free_ret = sys_pkey_free(alloced_pkeys[i]); 652 + pkey_assert(!free_ret); 653 + } 654 + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, 655 + __LINE__, ret, __rdpkru(), shadow_pkru); 656 + return ret; 657 + } 658 + 659 + int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, 660 + unsigned long pkey) 661 + { 662 + int nr_iterations = random() % 100; 663 + int ret; 664 + 665 + while (0) { 666 + int rpkey = alloc_random_pkey(); 667 + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 668 + dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 669 + ptr, size, orig_prot, pkey, ret); 670 + if (nr_iterations-- < 0) 671 + break; 672 + 673 + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, 674 + __LINE__, ret, __rdpkru(), shadow_pkru); 675 + sys_pkey_free(rpkey); 676 + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, 677 + __LINE__, ret, __rdpkru(), shadow_pkru); 678 
+ } 679 + pkey_assert(pkey < NR_PKEYS); 680 + 681 + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 682 + dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 683 + ptr, size, orig_prot, pkey, ret); 684 + pkey_assert(!ret); 685 + dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, 686 + __LINE__, ret, __rdpkru(), shadow_pkru); 687 + return ret; 688 + } 689 + 690 + struct pkey_malloc_record { 691 + void *ptr; 692 + long size; 693 + }; 694 + struct pkey_malloc_record *pkey_malloc_records; 695 + long nr_pkey_malloc_records; 696 + void record_pkey_malloc(void *ptr, long size) 697 + { 698 + long i; 699 + struct pkey_malloc_record *rec = NULL; 700 + 701 + for (i = 0; i < nr_pkey_malloc_records; i++) { 702 + rec = &pkey_malloc_records[i]; 703 + /* find a free record */ 704 + if (rec) 705 + break; 706 + } 707 + if (!rec) { 708 + /* every record is full */ 709 + size_t old_nr_records = nr_pkey_malloc_records; 710 + size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); 711 + size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); 712 + dprintf2("new_nr_records: %zd\n", new_nr_records); 713 + dprintf2("new_size: %zd\n", new_size); 714 + pkey_malloc_records = realloc(pkey_malloc_records, new_size); 715 + pkey_assert(pkey_malloc_records != NULL); 716 + rec = &pkey_malloc_records[nr_pkey_malloc_records]; 717 + /* 718 + * realloc() does not initialize memory, so zero it from 719 + * the first new record all the way to the end. 
720 + */ 721 + for (i = 0; i < new_nr_records - old_nr_records; i++) 722 + memset(rec + i, 0, sizeof(*rec)); 723 + } 724 + dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", 725 + (int)(rec - pkey_malloc_records), rec, ptr, size); 726 + rec->ptr = ptr; 727 + rec->size = size; 728 + nr_pkey_malloc_records++; 729 + } 730 + 731 + void free_pkey_malloc(void *ptr) 732 + { 733 + long i; 734 + int ret; 735 + dprintf3("%s(%p)\n", __func__, ptr); 736 + for (i = 0; i < nr_pkey_malloc_records; i++) { 737 + struct pkey_malloc_record *rec = &pkey_malloc_records[i]; 738 + dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", 739 + ptr, i, rec, rec->ptr, rec->size); 740 + if ((ptr < rec->ptr) || 741 + (ptr >= rec->ptr + rec->size)) 742 + continue; 743 + 744 + dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", 745 + ptr, i, rec, rec->ptr, rec->size); 746 + nr_pkey_malloc_records--; 747 + ret = munmap(rec->ptr, rec->size); 748 + dprintf3("munmap ret: %d\n", ret); 749 + pkey_assert(!ret); 750 + dprintf3("clearing rec->ptr, rec: %p\n", rec); 751 + rec->ptr = NULL; 752 + dprintf3("done clearing rec->ptr, rec: %p\n", rec); 753 + return; 754 + } 755 + pkey_assert(false); 756 + } 757 + 758 + 759 + void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) 760 + { 761 + void *ptr; 762 + int ret; 763 + 764 + rdpkru(); 765 + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 766 + size, prot, pkey); 767 + pkey_assert(pkey < NR_PKEYS); 768 + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 769 + pkey_assert(ptr != (void *)-1); 770 + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); 771 + pkey_assert(!ret); 772 + record_pkey_malloc(ptr, size); 773 + rdpkru(); 774 + 775 + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); 776 + return ptr; 777 + } 778 + 779 + void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) 780 + { 781 + int ret; 782 + void *ptr; 783 + 784 + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 785 
+ size, prot, pkey); 786 + /* 787 + * Guarantee we can fit at least one huge page in the resulting 788 + * allocation by allocating space for 2: 789 + */ 790 + size = ALIGN_UP(size, HPAGE_SIZE * 2); 791 + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 792 + pkey_assert(ptr != (void *)-1); 793 + record_pkey_malloc(ptr, size); 794 + mprotect_pkey(ptr, size, prot, pkey); 795 + 796 + dprintf1("unaligned ptr: %p\n", ptr); 797 + ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); 798 + dprintf1(" aligned ptr: %p\n", ptr); 799 + ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); 800 + dprintf1("MADV_HUGEPAGE ret: %d\n", ret); 801 + ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); 802 + dprintf1("MADV_WILLNEED ret: %d\n", ret); 803 + memset(ptr, 0, HPAGE_SIZE); 804 + 805 + dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); 806 + return ptr; 807 + } 808 + 809 + int hugetlb_setup_ok; 810 + #define GET_NR_HUGE_PAGES 10 811 + void setup_hugetlbfs(void) 812 + { 813 + int err; 814 + int fd; 815 + int validated_nr_pages; 816 + int i; 817 + char buf[] = "123"; 818 + 819 + if (geteuid() != 0) { 820 + fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); 821 + return; 822 + } 823 + 824 + cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); 825 + 826 + /* 827 + * Now go make sure that we got the pages and that they 828 + * are 2M pages. Someone might have made 1G the default. 
829 + */ 830 + fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); 831 + if (fd < 0) { 832 + perror("opening sysfs 2M hugetlb config"); 833 + return; 834 + } 835 + 836 + /* -1 to guarantee leaving the trailing \0 */ 837 + err = read(fd, buf, sizeof(buf)-1); 838 + close(fd); 839 + if (err <= 0) { 840 + perror("reading sysfs 2M hugetlb config"); 841 + return; 842 + } 843 + 844 + if (atoi(buf) != GET_NR_HUGE_PAGES) { 845 + fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", 846 + buf, GET_NR_HUGE_PAGES); 847 + return; 848 + } 849 + 850 + hugetlb_setup_ok = 1; 851 + } 852 + 853 + void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) 854 + { 855 + void *ptr; 856 + int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; 857 + 858 + if (!hugetlb_setup_ok) 859 + return PTR_ERR_ENOTSUP; 860 + 861 + dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); 862 + size = ALIGN_UP(size, HPAGE_SIZE * 2); 863 + pkey_assert(pkey < NR_PKEYS); 864 + ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); 865 + pkey_assert(ptr != (void *)-1); 866 + mprotect_pkey(ptr, size, prot, pkey); 867 + 868 + record_pkey_malloc(ptr, size); 869 + 870 + dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); 871 + return ptr; 872 + } 873 + 874 + void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) 875 + { 876 + void *ptr; 877 + int fd; 878 + 879 + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 880 + size, prot, pkey); 881 + pkey_assert(pkey < NR_PKEYS); 882 + fd = open("/dax/foo", O_RDWR); 883 + pkey_assert(fd >= 0); 884 + 885 + ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); 886 + pkey_assert(ptr != (void *)-1); 887 + 888 + mprotect_pkey(ptr, size, prot, pkey); 889 + 890 + record_pkey_malloc(ptr, size); 891 + 892 + dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); 893 + close(fd); 894 + return ptr; 895 + } 896 + 897 + void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { 898 + 899 + malloc_pkey_with_mprotect, 900 + 
malloc_pkey_anon_huge, 901 + malloc_pkey_hugetlb 902 + /* can not do direct with the pkey_mprotect() API: 903 + malloc_pkey_mmap_direct, 904 + malloc_pkey_mmap_dax, 905 + */ 906 + }; 907 + 908 + void *malloc_pkey(long size, int prot, u16 pkey) 909 + { 910 + void *ret; 911 + static int malloc_type; 912 + int nr_malloc_types = ARRAY_SIZE(pkey_malloc); 913 + 914 + pkey_assert(pkey < NR_PKEYS); 915 + 916 + while (1) { 917 + pkey_assert(malloc_type < nr_malloc_types); 918 + 919 + ret = pkey_malloc[malloc_type](size, prot, pkey); 920 + pkey_assert(ret != (void *)-1); 921 + 922 + malloc_type++; 923 + if (malloc_type >= nr_malloc_types) 924 + malloc_type = (random()%nr_malloc_types); 925 + 926 + /* try again if the malloc_type we tried is unsupported */ 927 + if (ret == PTR_ERR_ENOTSUP) 928 + continue; 929 + 930 + break; 931 + } 932 + 933 + dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, 934 + size, prot, pkey, ret); 935 + return ret; 936 + } 937 + 938 + int last_pkru_faults; 939 + void expected_pk_fault(int pkey) 940 + { 941 + dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", 942 + __func__, last_pkru_faults, pkru_faults); 943 + dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); 944 + pkey_assert(last_pkru_faults + 1 == pkru_faults); 945 + pkey_assert(last_si_pkey == pkey); 946 + /* 947 + * The signal handler shold have cleared out PKRU to let the 948 + * test program continue. We now have to restore it. 
949 + */ 950 + if (__rdpkru() != 0) 951 + pkey_assert(0); 952 + 953 + __wrpkru(shadow_pkru); 954 + dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", 955 + __func__, shadow_pkru); 956 + last_pkru_faults = pkru_faults; 957 + last_si_pkey = -1; 958 + } 959 + 960 + void do_not_expect_pk_fault(void) 961 + { 962 + pkey_assert(last_pkru_faults == pkru_faults); 963 + } 964 + 965 + int test_fds[10] = { -1 }; 966 + int nr_test_fds; 967 + void __save_test_fd(int fd) 968 + { 969 + pkey_assert(fd >= 0); 970 + pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); 971 + test_fds[nr_test_fds] = fd; 972 + nr_test_fds++; 973 + } 974 + 975 + int get_test_read_fd(void) 976 + { 977 + int test_fd = open("/etc/passwd", O_RDONLY); 978 + __save_test_fd(test_fd); 979 + return test_fd; 980 + } 981 + 982 + void close_test_fds(void) 983 + { 984 + int i; 985 + 986 + for (i = 0; i < nr_test_fds; i++) { 987 + if (test_fds[i] < 0) 988 + continue; 989 + close(test_fds[i]); 990 + test_fds[i] = -1; 991 + } 992 + nr_test_fds = 0; 993 + } 994 + 995 + #define barrier() __asm__ __volatile__("": : :"memory") 996 + __attribute__((noinline)) int read_ptr(int *ptr) 997 + { 998 + /* 999 + * Keep GCC from optimizing this away somehow 1000 + */ 1001 + barrier(); 1002 + return *ptr; 1003 + } 1004 + 1005 + void test_read_of_write_disabled_region(int *ptr, u16 pkey) 1006 + { 1007 + int ptr_contents; 1008 + 1009 + dprintf1("disabling write access to PKEY[1], doing read\n"); 1010 + pkey_write_deny(pkey); 1011 + ptr_contents = read_ptr(ptr); 1012 + dprintf1("*ptr: %d\n", ptr_contents); 1013 + dprintf1("\n"); 1014 + } 1015 + void test_read_of_access_disabled_region(int *ptr, u16 pkey) 1016 + { 1017 + int ptr_contents; 1018 + 1019 + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); 1020 + rdpkru(); 1021 + pkey_access_deny(pkey); 1022 + ptr_contents = read_ptr(ptr); 1023 + dprintf1("*ptr: %d\n", ptr_contents); 1024 + expected_pk_fault(pkey); 1025 + } 1026 + void 
test_write_of_write_disabled_region(int *ptr, u16 pkey) 1027 + { 1028 + dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); 1029 + pkey_write_deny(pkey); 1030 + *ptr = __LINE__; 1031 + expected_pk_fault(pkey); 1032 + } 1033 + void test_write_of_access_disabled_region(int *ptr, u16 pkey) 1034 + { 1035 + dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); 1036 + pkey_access_deny(pkey); 1037 + *ptr = __LINE__; 1038 + expected_pk_fault(pkey); 1039 + } 1040 + void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) 1041 + { 1042 + int ret; 1043 + int test_fd = get_test_read_fd(); 1044 + 1045 + dprintf1("disabling access to PKEY[%02d], " 1046 + "having kernel read() to buffer\n", pkey); 1047 + pkey_access_deny(pkey); 1048 + ret = read(test_fd, ptr, 1); 1049 + dprintf1("read ret: %d\n", ret); 1050 + pkey_assert(ret); 1051 + } 1052 + void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) 1053 + { 1054 + int ret; 1055 + int test_fd = get_test_read_fd(); 1056 + 1057 + pkey_write_deny(pkey); 1058 + ret = read(test_fd, ptr, 100); 1059 + dprintf1("read ret: %d\n", ret); 1060 + if (ret < 0 && (DEBUG_LEVEL > 0)) 1061 + perror("verbose read result (OK for this to be bad)"); 1062 + pkey_assert(ret); 1063 + } 1064 + 1065 + void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) 1066 + { 1067 + int pipe_ret, vmsplice_ret; 1068 + struct iovec iov; 1069 + int pipe_fds[2]; 1070 + 1071 + pipe_ret = pipe(pipe_fds); 1072 + 1073 + pkey_assert(pipe_ret == 0); 1074 + dprintf1("disabling access to PKEY[%02d], " 1075 + "having kernel vmsplice from buffer\n", pkey); 1076 + pkey_access_deny(pkey); 1077 + iov.iov_base = ptr; 1078 + iov.iov_len = PAGE_SIZE; 1079 + vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); 1080 + dprintf1("vmsplice() ret: %d\n", vmsplice_ret); 1081 + pkey_assert(vmsplice_ret == -1); 1082 + 1083 + close(pipe_fds[0]); 1084 + close(pipe_fds[1]); 1085 + } 1086 + 1087 + void 
test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) 1088 + { 1089 + int ignored = 0xdada; 1090 + int futex_ret; 1091 + int some_int = __LINE__; 1092 + 1093 + dprintf1("disabling write to PKEY[%02d], " 1094 + "doing futex gunk in buffer\n", pkey); 1095 + *ptr = some_int; 1096 + pkey_write_deny(pkey); 1097 + futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, 1098 + &ignored, ignored); 1099 + if (DEBUG_LEVEL > 0) 1100 + perror("futex"); 1101 + dprintf1("futex() ret: %d\n", futex_ret); 1102 + } 1103 + 1104 + /* Assumes that all pkeys other than 'pkey' are unallocated */ 1105 + void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) 1106 + { 1107 + int err; 1108 + int i; 1109 + 1110 + /* Note: 0 is the default pkey, so don't mess with it */ 1111 + for (i = 1; i < NR_PKEYS; i++) { 1112 + if (pkey == i) 1113 + continue; 1114 + 1115 + dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); 1116 + err = sys_pkey_free(i); 1117 + pkey_assert(err); 1118 + 1119 + /* not enforced when pkey_get() is not a syscall 1120 + err = pkey_get(i, 0); 1121 + pkey_assert(err < 0); 1122 + */ 1123 + 1124 + err = sys_pkey_free(i); 1125 + pkey_assert(err); 1126 + 1127 + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); 1128 + pkey_assert(err); 1129 + } 1130 + } 1131 + 1132 + /* Assumes that all pkeys other than 'pkey' are unallocated */ 1133 + void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) 1134 + { 1135 + int err; 1136 + int bad_flag = (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + 1; 1137 + int bad_pkey = NR_PKEYS+99; 1138 + 1139 + /* not enforced when pkey_get() is not a syscall 1140 + err = pkey_get(bad_pkey, bad_flag); 1141 + pkey_assert(err < 0); 1142 + */ 1143 + 1144 + /* pass a known-invalid pkey in: */ 1145 + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); 1146 + pkey_assert(err); 1147 + } 1148 + 1149 + /* Assumes that all pkeys other than 'pkey' are unallocated */ 1150 + void test_pkey_alloc_exhaust(int *ptr, u16 
pkey) 1151 + { 1152 + unsigned long flags; 1153 + unsigned long init_val; 1154 + int err; 1155 + int allocated_pkeys[NR_PKEYS] = {0}; 1156 + int nr_allocated_pkeys = 0; 1157 + int i; 1158 + 1159 + for (i = 0; i < NR_PKEYS*2; i++) { 1160 + int new_pkey; 1161 + dprintf1("%s() alloc loop: %d\n", __func__, i); 1162 + new_pkey = alloc_pkey(); 1163 + dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, 1164 + __LINE__, err, __rdpkru(), shadow_pkru); 1165 + rdpkru(); /* for shadow checking */ 1166 + dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); 1167 + if ((new_pkey == -1) && (errno == ENOSPC)) { 1168 + dprintf2("%s() failed to allocate pkey after %d tries\n", 1169 + __func__, nr_allocated_pkeys); 1170 + break; 1171 + } 1172 + pkey_assert(nr_allocated_pkeys < NR_PKEYS); 1173 + allocated_pkeys[nr_allocated_pkeys++] = new_pkey; 1174 + } 1175 + 1176 + dprintf3("%s()::%d\n", __func__, __LINE__); 1177 + 1178 + /* 1179 + * ensure it did not reach the end of the loop without 1180 + * failure: 1181 + */ 1182 + pkey_assert(i < NR_PKEYS*2); 1183 + 1184 + /* 1185 + * There are 16 pkeys supported in hardware. One is taken 1186 + * up for the default (0) and another can be taken up by 1187 + * an execute-only mapping. Ensure that we can allocate 1188 + * at least 14 (16-2). 1189 + */ 1190 + pkey_assert(i >= NR_PKEYS-2); 1191 + 1192 + for (i = 0; i < nr_allocated_pkeys; i++) { 1193 + err = sys_pkey_free(allocated_pkeys[i]); 1194 + pkey_assert(!err); 1195 + rdpkru(); /* for shadow checking */ 1196 + } 1197 + } 1198 + 1199 + void test_ptrace_of_child(int *ptr, u16 pkey) 1200 + { 1201 + __attribute__((__unused__)) int peek_result; 1202 + pid_t child_pid; 1203 + void *ignored = 0; 1204 + long ret; 1205 + int status; 1206 + /* 1207 + * This is the "control" for our little expermient. Make sure 1208 + * we can always access it when ptracing. 
1209 + */ 1210 + int *plain_ptr_unaligned = malloc(HPAGE_SIZE); 1211 + int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); 1212 + 1213 + /* 1214 + * Fork a child which is an exact copy of this process, of course. 1215 + * That means we can do all of our tests via ptrace() and then plain 1216 + * memory access and ensure they work differently. 1217 + */ 1218 + child_pid = fork_lazy_child(); 1219 + dprintf1("[%d] child pid: %d\n", getpid(), child_pid); 1220 + 1221 + ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); 1222 + if (ret) 1223 + perror("attach"); 1224 + dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); 1225 + pkey_assert(ret != -1); 1226 + ret = waitpid(child_pid, &status, WUNTRACED); 1227 + if ((ret != child_pid) || !(WIFSTOPPED(status))) { 1228 + fprintf(stderr, "weird waitpid result %ld stat %x\n", 1229 + ret, status); 1230 + pkey_assert(0); 1231 + } 1232 + dprintf2("waitpid ret: %ld\n", ret); 1233 + dprintf2("waitpid status: %d\n", status); 1234 + 1235 + pkey_access_deny(pkey); 1236 + pkey_write_deny(pkey); 1237 + 1238 + /* Write access, untested for now: 1239 + ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); 1240 + pkey_assert(ret != -1); 1241 + dprintf1("poke at %p: %ld\n", peek_at, ret); 1242 + */ 1243 + 1244 + /* 1245 + * Try to access the pkey-protected "ptr" via ptrace: 1246 + */ 1247 + ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); 1248 + /* expect it to work, without an error: */ 1249 + pkey_assert(ret != -1); 1250 + /* Now access from the current task, and expect an exception: */ 1251 + peek_result = read_ptr(ptr); 1252 + expected_pk_fault(pkey); 1253 + 1254 + /* 1255 + * Try to access the NON-pkey-protected "plain_ptr" via ptrace: 1256 + */ 1257 + ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); 1258 + /* expect it to work, without an error: */ 1259 + pkey_assert(ret != -1); 1260 + /* Now access from the current task, and expect NO exception: */ 1261 + peek_result = 
read_ptr(plain_ptr); 1262 + do_not_expect_pk_fault(); 1263 + 1264 + ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); 1265 + pkey_assert(ret != -1); 1266 + 1267 + ret = kill(child_pid, SIGKILL); 1268 + pkey_assert(ret != -1); 1269 + 1270 + wait(&status); 1271 + 1272 + free(plain_ptr_unaligned); 1273 + } 1274 + 1275 + void test_executing_on_unreadable_memory(int *ptr, u16 pkey) 1276 + { 1277 + void *p1; 1278 + int scratch; 1279 + int ptr_contents; 1280 + int ret; 1281 + 1282 + p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); 1283 + dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); 1284 + /* lots_o_noops_around_write should be page-aligned already */ 1285 + assert(p1 == &lots_o_noops_around_write); 1286 + 1287 + /* Point 'p1' at the *second* page of the function: */ 1288 + p1 += PAGE_SIZE; 1289 + 1290 + madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1291 + lots_o_noops_around_write(&scratch); 1292 + ptr_contents = read_ptr(p1); 1293 + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1294 + 1295 + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); 1296 + pkey_assert(!ret); 1297 + pkey_access_deny(pkey); 1298 + 1299 + dprintf2("pkru: %x\n", rdpkru()); 1300 + 1301 + /* 1302 + * Make sure this is an *instruction* fault 1303 + */ 1304 + madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1305 + lots_o_noops_around_write(&scratch); 1306 + do_not_expect_pk_fault(); 1307 + ptr_contents = read_ptr(p1); 1308 + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1309 + expected_pk_fault(pkey); 1310 + } 1311 + 1312 + void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) 1313 + { 1314 + int size = PAGE_SIZE; 1315 + int sret; 1316 + 1317 + if (cpu_has_pku()) { 1318 + dprintf1("SKIP: %s: no CPU support\n", __func__); 1319 + return; 1320 + } 1321 + 1322 + sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); 1323 + pkey_assert(sret < 0); 1324 + } 1325 + 1326 + void (*pkey_tests[])(int *ptr, u16 pkey) = { 1327 + 
test_read_of_write_disabled_region, 1328 + test_read_of_access_disabled_region, 1329 + test_write_of_write_disabled_region, 1330 + test_write_of_access_disabled_region, 1331 + test_kernel_write_of_access_disabled_region, 1332 + test_kernel_write_of_write_disabled_region, 1333 + test_kernel_gup_of_access_disabled_region, 1334 + test_kernel_gup_write_to_write_disabled_region, 1335 + test_executing_on_unreadable_memory, 1336 + test_ptrace_of_child, 1337 + test_pkey_syscalls_on_non_allocated_pkey, 1338 + test_pkey_syscalls_bad_args, 1339 + test_pkey_alloc_exhaust, 1340 + }; 1341 + 1342 + void run_tests_once(void) 1343 + { 1344 + int *ptr; 1345 + int prot = PROT_READ|PROT_WRITE; 1346 + 1347 + for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { 1348 + int pkey; 1349 + int orig_pkru_faults = pkru_faults; 1350 + 1351 + dprintf1("======================\n"); 1352 + dprintf1("test %d preparing...\n", test_nr); 1353 + 1354 + tracing_on(); 1355 + pkey = alloc_random_pkey(); 1356 + dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); 1357 + ptr = malloc_pkey(PAGE_SIZE, prot, pkey); 1358 + dprintf1("test %d starting...\n", test_nr); 1359 + pkey_tests[test_nr](ptr, pkey); 1360 + dprintf1("freeing test memory: %p\n", ptr); 1361 + free_pkey_malloc(ptr); 1362 + sys_pkey_free(pkey); 1363 + 1364 + dprintf1("pkru_faults: %d\n", pkru_faults); 1365 + dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); 1366 + 1367 + tracing_off(); 1368 + close_test_fds(); 1369 + 1370 + printf("test %2d PASSED (itertation %d)\n", test_nr, iteration_nr); 1371 + dprintf1("======================\n\n"); 1372 + } 1373 + iteration_nr++; 1374 + } 1375 + 1376 + void pkey_setup_shadow(void) 1377 + { 1378 + shadow_pkru = __rdpkru(); 1379 + } 1380 + 1381 + int main(void) 1382 + { 1383 + int nr_iterations = 22; 1384 + 1385 + setup_handlers(); 1386 + 1387 + printf("has pku: %d\n", cpu_has_pku()); 1388 + 1389 + if (!cpu_has_pku()) { 1390 + int size = PAGE_SIZE; 1391 + int *ptr; 1392 + 1393 + 
printf("running PKEY tests for unsupported CPU/OS\n"); 1394 + 1395 + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 1396 + assert(ptr != (void *)-1); 1397 + test_mprotect_pkey_on_unsupported_cpu(ptr, 1); 1398 + exit(0); 1399 + } 1400 + 1401 + pkey_setup_shadow(); 1402 + printf("startup pkru: %x\n", rdpkru()); 1403 + setup_hugetlbfs(); 1404 + 1405 + while (nr_iterations-- > 0) 1406 + run_tests_once(); 1407 + 1408 + printf("done (all tests OK)\n"); 1409 + return 0; 1410 + }