Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'random-6.11-rc1-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random

Pull random number generator updates from Jason Donenfeld:
"This adds getrandom() support to the vDSO.

First, it adds a new kind of mapping to mmap(2), MAP_DROPPABLE, which
lets the kernel zero out pages anytime under memory pressure, which
enables allocating memory that never gets swapped to disk but also
doesn't count as being mlocked.

Then, the vDSO implementation of getrandom() is introduced in a
generic manner and hooked into random.c.

Next, this is implemented on x86. (Also, though it's not ready for
this pull, somebody has begun an arm64 implementation already)

Finally, two vDSO selftests are added.

There are also two housekeeping cleanup commits"

* tag 'random-6.11-rc1-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random:
MAINTAINERS: add random.h headers to RNG subsection
random: note that RNDGETPOOL was removed in 2.6.9-rc2
selftests/vDSO: add tests for vgetrandom
x86: vdso: Wire up getrandom() vDSO implementation
random: introduce generic vDSO getrandom() implementation
mm: add MAP_DROPPABLE for designating always lazily freeable mappings

+1121 -18
+6
MAINTAINERS
··· 19057 19057 T: git https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git 19058 19058 F: Documentation/devicetree/bindings/rng/microsoft,vmgenid.yaml 19059 19059 F: drivers/char/random.c 19060 + F: include/linux/random.h 19061 + F: include/uapi/linux/random.h 19060 19062 F: drivers/virt/vmgenid.c 19063 + F: include/vdso/getrandom.h 19064 + F: lib/vdso/getrandom.c 19065 + F: arch/x86/entry/vdso/vgetrandom* 19066 + F: arch/x86/include/asm/vdso/getrandom* 19061 19067 19062 19068 RAPIDIO SUBSYSTEM 19063 19069 M: Matt Porter <mporter@kernel.crashing.org>
+1
arch/x86/Kconfig
··· 287 287 select HAVE_UNSTABLE_SCHED_CLOCK 288 288 select HAVE_USER_RETURN_NOTIFIER 289 289 select HAVE_GENERIC_VDSO 290 + select VDSO_GETRANDOM if X86_64 290 291 select HOTPLUG_PARALLEL if SMP && X86_64 291 292 select HOTPLUG_SMT if SMP 292 293 select HOTPLUG_SPLIT_STARTUP if SMP && X86_32
+2 -1
arch/x86/entry/vdso/Makefile
··· 7 7 include $(srctree)/lib/vdso/Makefile 8 8 9 9 # Files to link into the vDSO: 10 - vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o 10 + vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o 11 11 vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o 12 12 vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o 13 13 vobjs-$(CONFIG_X86_SGX) += vsgx.o ··· 73 73 CFLAGS_REMOVE_vgetcpu.o = -pg 74 74 CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg 75 75 CFLAGS_REMOVE_vsgx.o = -pg 76 + CFLAGS_REMOVE_vgetrandom.o = -pg 76 77 77 78 # 78 79 # X32 processes use x32 vDSO to access 64bit kernel data.
+2
arch/x86/entry/vdso/vdso.lds.S
··· 30 30 #ifdef CONFIG_X86_SGX 31 31 __vdso_sgx_enter_enclave; 32 32 #endif 33 + getrandom; 34 + __vdso_getrandom; 33 35 local: *; 34 36 }; 35 37 }
+178
arch/x86/entry/vdso/vgetrandom-chacha.S
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 + */ 5 + 6 + #include <linux/linkage.h> 7 + #include <asm/frame.h> 8 + 9 + .section .rodata, "a" 10 + .align 16 11 + CONSTANTS: .octa 0x6b20657479622d323320646e61707865 12 + .text 13 + 14 + /* 15 + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number 16 + * of blocks of output with a nonce of 0, taking an input key and 8-byte 17 + * counter. Importantly does not spill to the stack. Its arguments are: 18 + * 19 + * rdi: output bytes 20 + * rsi: 32-byte key input 21 + * rdx: 8-byte counter input/output 22 + * rcx: number of 64-byte blocks to write to output 23 + */ 24 + SYM_FUNC_START(__arch_chacha20_blocks_nostack) 25 + 26 + .set output, %rdi 27 + .set key, %rsi 28 + .set counter, %rdx 29 + .set nblocks, %rcx 30 + .set i, %al 31 + /* xmm registers are *not* callee-save. */ 32 + .set temp, %xmm0 33 + .set state0, %xmm1 34 + .set state1, %xmm2 35 + .set state2, %xmm3 36 + .set state3, %xmm4 37 + .set copy0, %xmm5 38 + .set copy1, %xmm6 39 + .set copy2, %xmm7 40 + .set copy3, %xmm8 41 + .set one, %xmm9 42 + 43 + /* copy0 = "expand 32-byte k" */ 44 + movaps CONSTANTS(%rip),copy0 45 + /* copy1,copy2 = key */ 46 + movups 0x00(key),copy1 47 + movups 0x10(key),copy2 48 + /* copy3 = counter || zero nonce */ 49 + movq 0x00(counter),copy3 50 + /* one = 1 || 0 */ 51 + movq $1,%rax 52 + movq %rax,one 53 + 54 + .Lblock: 55 + /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */ 56 + movdqa copy0,state0 57 + movdqa copy1,state1 58 + movdqa copy2,state2 59 + movdqa copy3,state3 60 + 61 + movb $10,i 62 + .Lpermute: 63 + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ 64 + paddd state1,state0 65 + pxor state0,state3 66 + movdqa state3,temp 67 + pslld $16,temp 68 + psrld $16,state3 69 + por temp,state3 70 + 71 + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ 72 + paddd state3,state2 73 + 
pxor state2,state1 74 + movdqa state1,temp 75 + pslld $12,temp 76 + psrld $20,state1 77 + por temp,state1 78 + 79 + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ 80 + paddd state1,state0 81 + pxor state0,state3 82 + movdqa state3,temp 83 + pslld $8,temp 84 + psrld $24,state3 85 + por temp,state3 86 + 87 + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ 88 + paddd state3,state2 89 + pxor state2,state1 90 + movdqa state1,temp 91 + pslld $7,temp 92 + psrld $25,state1 93 + por temp,state1 94 + 95 + /* state1[0,1,2,3] = state1[1,2,3,0] */ 96 + pshufd $0x39,state1,state1 97 + /* state2[0,1,2,3] = state2[2,3,0,1] */ 98 + pshufd $0x4e,state2,state2 99 + /* state3[0,1,2,3] = state3[3,0,1,2] */ 100 + pshufd $0x93,state3,state3 101 + 102 + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ 103 + paddd state1,state0 104 + pxor state0,state3 105 + movdqa state3,temp 106 + pslld $16,temp 107 + psrld $16,state3 108 + por temp,state3 109 + 110 + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ 111 + paddd state3,state2 112 + pxor state2,state1 113 + movdqa state1,temp 114 + pslld $12,temp 115 + psrld $20,state1 116 + por temp,state1 117 + 118 + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ 119 + paddd state1,state0 120 + pxor state0,state3 121 + movdqa state3,temp 122 + pslld $8,temp 123 + psrld $24,state3 124 + por temp,state3 125 + 126 + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ 127 + paddd state3,state2 128 + pxor state2,state1 129 + movdqa state1,temp 130 + pslld $7,temp 131 + psrld $25,state1 132 + por temp,state1 133 + 134 + /* state1[0,1,2,3] = state1[3,0,1,2] */ 135 + pshufd $0x93,state1,state1 136 + /* state2[0,1,2,3] = state2[2,3,0,1] */ 137 + pshufd $0x4e,state2,state2 138 + /* state3[0,1,2,3] = state3[1,2,3,0] */ 139 + pshufd $0x39,state3,state3 140 + 141 + decb i 142 + jnz .Lpermute 143 + 144 + /* output0 = state0 + copy0 */ 145 + paddd copy0,state0 146 + movups state0,0x00(output) 147 + 
/* output1 = state1 + copy1 */ 148 + paddd copy1,state1 149 + movups state1,0x10(output) 150 + /* output2 = state2 + copy2 */ 151 + paddd copy2,state2 152 + movups state2,0x20(output) 153 + /* output3 = state3 + copy3 */ 154 + paddd copy3,state3 155 + movups state3,0x30(output) 156 + 157 + /* ++copy3.counter */ 158 + paddq one,copy3 159 + 160 + /* output += 64, --nblocks */ 161 + addq $64,output 162 + decq nblocks 163 + jnz .Lblock 164 + 165 + /* counter = copy3.counter */ 166 + movq copy3,0x00(counter) 167 + 168 + /* Zero out the potentially sensitive regs, in case nothing uses these again. */ 169 + pxor state0,state0 170 + pxor state1,state1 171 + pxor state2,state2 172 + pxor state3,state3 173 + pxor copy1,copy1 174 + pxor copy2,copy2 175 + pxor temp,temp 176 + 177 + ret 178 + SYM_FUNC_END(__arch_chacha20_blocks_nostack)
+17
arch/x86/entry/vdso/vgetrandom.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 + */ 5 + #include <linux/types.h> 6 + 7 + #include "../../../../lib/vdso/getrandom.c" 8 + 9 + ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); 10 + 11 + ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) 12 + { 13 + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); 14 + } 15 + 16 + ssize_t getrandom(void *, size_t, unsigned int, void *, size_t) 17 + __attribute__((weak, alias("__vdso_getrandom")));
+55
arch/x86/include/asm/vdso/getrandom.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 + */ 5 + #ifndef __ASM_VDSO_GETRANDOM_H 6 + #define __ASM_VDSO_GETRANDOM_H 7 + 8 + #ifndef __ASSEMBLY__ 9 + 10 + #include <asm/unistd.h> 11 + #include <asm/vvar.h> 12 + 13 + /** 14 + * getrandom_syscall - Invoke the getrandom() syscall. 15 + * @buffer: Destination buffer to fill with random bytes. 16 + * @len: Size of @buffer in bytes. 17 + * @flags: Zero or more GRND_* flags. 18 + * Returns: The number of random bytes written to @buffer, or a negative value indicating an error. 19 + */ 20 + static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags) 21 + { 22 + long ret; 23 + 24 + asm ("syscall" : "=a" (ret) : 25 + "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) : 26 + "rcx", "r11", "memory"); 27 + 28 + return ret; 29 + } 30 + 31 + #define __vdso_rng_data (VVAR(_vdso_rng_data)) 32 + 33 + static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) 34 + { 35 + if (IS_ENABLED(CONFIG_TIME_NS) && __vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) 36 + return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data); 37 + return &__vdso_rng_data; 38 + } 39 + 40 + /** 41 + * __arch_chacha20_blocks_nostack - Generate ChaCha20 stream without using the stack. 42 + * @dst_bytes: Destination buffer to hold @nblocks * 64 bytes of output. 43 + * @key: 32-byte input key. 44 + * @counter: 8-byte counter, read on input and updated on return. 45 + * @nblocks: Number of blocks to generate. 46 + * 47 + * Generates a given positive number of blocks of ChaCha20 output with nonce=0, and does not write 48 + * to any stack or memory outside of the parameters passed to it, in order to mitigate stack data 49 + * leaking into forked child processes. 
50 + */ 51 + extern void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks); 52 + 53 + #endif /* !__ASSEMBLY__ */ 54 + 55 + #endif /* __ASM_VDSO_GETRANDOM_H */
+2
arch/x86/include/asm/vdso/vsyscall.h
··· 10 10 #include <asm/vvar.h> 11 11 12 12 DEFINE_VVAR(struct vdso_data, _vdso_data); 13 + DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data); 14 + 13 15 /* 14 16 * Update the vDSO data page to keep in sync with kernel timekeeping. 15 17 */
+16
arch/x86/include/asm/vvar.h
··· 26 26 */ 27 27 #define DECLARE_VVAR(offset, type, name) \ 28 28 EMIT_VVAR(name, offset) 29 + #define DECLARE_VVAR_SINGLE(offset, type, name) \ 30 + EMIT_VVAR(name, offset) 29 31 30 32 #else 31 33 ··· 39 37 extern type timens_ ## name[CS_BASES] \ 40 38 __attribute__((visibility("hidden"))); \ 41 39 40 + #define DECLARE_VVAR_SINGLE(offset, type, name) \ 41 + extern type vvar_ ## name \ 42 + __attribute__((visibility("hidden"))); \ 43 + 42 44 #define VVAR(name) (vvar_ ## name) 43 45 #define TIMENS(name) (timens_ ## name) 44 46 45 47 #define DEFINE_VVAR(type, name) \ 46 48 type name[CS_BASES] \ 49 + __attribute__((section(".vvar_" #name), aligned(16))) __visible 50 + 51 + #define DEFINE_VVAR_SINGLE(type, name) \ 52 + type name \ 47 53 __attribute__((section(".vvar_" #name), aligned(16))) __visible 48 54 49 55 #endif ··· 60 50 61 51 DECLARE_VVAR(128, struct vdso_data, _vdso_data) 62 52 53 + #if !defined(_SINGLE_DATA) 54 + #define _SINGLE_DATA 55 + DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data) 56 + #endif 57 + 63 58 #undef DECLARE_VVAR 59 + #undef DECLARE_VVAR_SINGLE 64 60 65 61 #endif
+17 -1
drivers/char/random.c
··· 1 1 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) 2 2 /* 3 - * Copyright (C) 2017-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 3 + * Copyright (C) 2017-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 4 * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005 5 5 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved. 6 6 * ··· 56 56 #include <linux/sched/isolation.h> 57 57 #include <crypto/chacha.h> 58 58 #include <crypto/blake2s.h> 59 + #ifdef CONFIG_VDSO_GETRANDOM 60 + #include <vdso/getrandom.h> 61 + #include <vdso/datapage.h> 62 + #endif 59 63 #include <asm/archrandom.h> 60 64 #include <asm/processor.h> 61 65 #include <asm/irq.h> ··· 275 271 if (next_gen == ULONG_MAX) 276 272 ++next_gen; 277 273 WRITE_ONCE(base_crng.generation, next_gen); 274 + #ifdef CONFIG_VDSO_GETRANDOM 275 + /* base_crng.generation's invalid value is ULONG_MAX, while 276 + * _vdso_rng_data.generation's invalid value is 0, so add one to the 277 + * former to arrive at the latter. Use smp_store_release so that this 278 + * is ordered with the write above to base_crng.generation. Pairs with 279 + * the smp_rmb() before the syscall in the vDSO code. 280 + */ 281 + smp_store_release(&_vdso_rng_data.generation, next_gen + 1); 282 + #endif 278 283 if (!static_branch_likely(&crng_is_ready)) 279 284 crng_init = CRNG_READY; 280 285 spin_unlock_irqrestore(&base_crng.lock, flags); ··· 734 721 if (static_key_initialized && system_unbound_wq) 735 722 queue_work(system_unbound_wq, &set_ready); 736 723 atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); 724 + #ifdef CONFIG_VDSO_GETRANDOM 725 + WRITE_ONCE(_vdso_rng_data.is_ready, true); 726 + #endif 737 727 wake_up_interruptible(&crng_init_wait); 738 728 kill_fasync(&fasync, SIGIO, POLL_IN); 739 729 pr_notice("crng init done\n");
+1
fs/proc/task_mmu.c
··· 988 988 [ilog2(VM_SHADOW_STACK)] = "ss", 989 989 #endif 990 990 #ifdef CONFIG_64BIT 991 + [ilog2(VM_DROPPABLE)] = "dp", 991 992 [ilog2(VM_SEALED)] = "sl", 992 993 #endif 993 994 };
+7
include/linux/mm.h
··· 407 407 #endif 408 408 409 409 #ifdef CONFIG_64BIT 410 + #define VM_DROPPABLE_BIT 40 411 + #define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) 412 + #else 413 + #define VM_DROPPABLE VM_NONE 414 + #endif 415 + 416 + #ifdef CONFIG_64BIT 410 417 /* VM is sealed, in vm_flags */ 411 418 #define VM_SEALED _BITUL(63) 412 419 #endif
+3
include/linux/userfaultfd_k.h
··· 218 218 { 219 219 vm_flags &= __VM_UFFD_FLAGS; 220 220 221 + if (vm_flags & VM_DROPPABLE) 222 + return false; 223 + 221 224 if ((vm_flags & VM_UFFD_MINOR) && 222 225 (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) 223 226 return false;
+7
include/trace/events/mmflags.h
··· 165 165 # define IF_HAVE_UFFD_MINOR(flag, name) 166 166 #endif 167 167 168 + #ifdef CONFIG_64BIT 169 + # define IF_HAVE_VM_DROPPABLE(flag, name) {flag, name}, 170 + #else 171 + # define IF_HAVE_VM_DROPPABLE(flag, name) 172 + #endif 173 + 168 174 #define __def_vmaflag_names \ 169 175 {VM_READ, "read" }, \ 170 176 {VM_WRITE, "write" }, \ ··· 203 197 {VM_MIXEDMAP, "mixedmap" }, \ 204 198 {VM_HUGEPAGE, "hugepage" }, \ 205 199 {VM_NOHUGEPAGE, "nohugepage" }, \ 200 + IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \ 206 201 {VM_MERGEABLE, "mergeable" } \ 207 202 208 203 #define show_vma_flags(flags) \
+1
include/uapi/linux/mman.h
··· 17 17 #define MAP_SHARED 0x01 /* Share changes */ 18 18 #define MAP_PRIVATE 0x02 /* Changes are private */ 19 19 #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ 20 + #define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */ 20 21 21 22 /* 22 23 * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
+16 -1
include/uapi/linux/random.h
··· 20 20 /* Add to (or subtract from) the entropy count. (Superuser only.) */ 21 21 #define RNDADDTOENTCNT _IOW( 'R', 0x01, int ) 22 22 23 - /* Get the contents of the entropy pool. (Superuser only.) */ 23 + /* Get the contents of the entropy pool. (Superuser only.) (Removed in 2.6.9-rc2.) */ 24 24 #define RNDGETPOOL _IOR( 'R', 0x02, int [2] ) 25 25 26 26 /* ··· 54 54 #define GRND_NONBLOCK 0x0001 55 55 #define GRND_RANDOM 0x0002 56 56 #define GRND_INSECURE 0x0004 57 + 58 + /** 59 + * struct vgetrandom_opaque_params - arguments for allocating memory for vgetrandom 60 + * 61 + * @size_of_opaque_state: Size of each state that is to be passed to vgetrandom(). 62 + * @mmap_prot: Value of the prot argument in mmap(2). 63 + * @mmap_flags: Value of the flags argument in mmap(2). 64 + * @reserved: Reserved for future use. 65 + */ 66 + struct vgetrandom_opaque_params { 67 + __u32 size_of_opaque_state; 68 + __u32 mmap_prot; 69 + __u32 mmap_flags; 70 + __u32 reserved[13]; 71 + }; 57 72 58 73 #endif /* _UAPI_LINUX_RANDOM_H */
+11
include/vdso/datapage.h
··· 117 117 struct arch_vdso_data arch_data; 118 118 }; 119 119 120 + /** 121 + * struct vdso_rng_data - vdso RNG state information 122 + * @generation: counter representing the number of RNG reseeds 123 + * @is_ready: boolean signaling whether the RNG is initialized 124 + */ 125 + struct vdso_rng_data { 126 + u64 generation; 127 + u8 is_ready; 128 + }; 129 + 120 130 /* 121 131 * We use the hidden visibility to prevent the compiler from generating a GOT 122 132 * relocation. Not only is going through a GOT useless (the entry couldn't and ··· 138 128 */ 139 129 extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden"))); 140 130 extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden"))); 131 + extern struct vdso_rng_data _vdso_rng_data __attribute__((visibility("hidden"))); 141 132 142 133 /** 143 134 * union vdso_data_store - Generic vDSO data page
+46
include/vdso/getrandom.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 + */ 5 + 6 + #ifndef _VDSO_GETRANDOM_H 7 + #define _VDSO_GETRANDOM_H 8 + 9 + #include <linux/types.h> 10 + 11 + #define CHACHA_KEY_SIZE 32 12 + #define CHACHA_BLOCK_SIZE 64 13 + 14 + /** 15 + * struct vgetrandom_state - State used by vDSO getrandom(). 16 + * 17 + * @batch: One and a half ChaCha20 blocks of buffered RNG output. 18 + * 19 + * @key: Key to be used for generating next batch. 20 + * 21 + * @batch_key: Union of the prior two members, which is exactly two full 22 + * ChaCha20 blocks in size, so that @batch and @key can be filled 23 + * together. 24 + * 25 + * @generation: Snapshot of @rng_info->generation in the vDSO data page at 26 + * the time @key was generated. 27 + * 28 + * @pos: Offset into @batch of the next available random byte. 29 + * 30 + * @in_use: Reentrancy guard for reusing a state within the same thread 31 + * due to signal handlers. 32 + */ 33 + struct vgetrandom_state { 34 + union { 35 + struct { 36 + u8 batch[CHACHA_BLOCK_SIZE * 3 / 2]; 37 + u32 key[CHACHA_KEY_SIZE / sizeof(u32)]; 38 + }; 39 + u8 batch_key[CHACHA_BLOCK_SIZE * 2]; 40 + }; 41 + u64 generation; 42 + u8 pos; 43 + bool in_use; 44 + }; 45 + 46 + #endif /* _VDSO_GETRANDOM_H */
+5
lib/vdso/Kconfig
··· 38 38 in the hotpath. 39 39 40 40 endif 41 + 42 + config VDSO_GETRANDOM 43 + bool 44 + help 45 + Selected by architectures that support vDSO getrandom().
+251
lib/vdso/getrandom.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 + */ 5 + 6 + #include <linux/cache.h> 7 + #include <linux/kernel.h> 8 + #include <linux/time64.h> 9 + #include <vdso/datapage.h> 10 + #include <vdso/getrandom.h> 11 + #include <asm/vdso/getrandom.h> 12 + #include <asm/vdso/vsyscall.h> 13 + #include <asm/unaligned.h> 14 + #include <uapi/linux/mman.h> 15 + 16 + #define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do { \ 17 + while (len >= sizeof(type)) { \ 18 + __put_unaligned_t(type, __get_unaligned_t(type, src), dst); \ 19 + __put_unaligned_t(type, 0, src); \ 20 + dst += sizeof(type); \ 21 + src += sizeof(type); \ 22 + len -= sizeof(type); \ 23 + } \ 24 + } while (0) 25 + 26 + static void memcpy_and_zero_src(void *dst, void *src, size_t len) 27 + { 28 + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { 29 + if (IS_ENABLED(CONFIG_64BIT)) 30 + MEMCPY_AND_ZERO_SRC(u64, dst, src, len); 31 + MEMCPY_AND_ZERO_SRC(u32, dst, src, len); 32 + MEMCPY_AND_ZERO_SRC(u16, dst, src, len); 33 + } 34 + MEMCPY_AND_ZERO_SRC(u8, dst, src, len); 35 + } 36 + 37 + /** 38 + * __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall. 39 + * @rng_info: Describes state of kernel RNG, memory shared with kernel. 40 + * @buffer: Destination buffer to fill with random bytes. 41 + * @len: Size of @buffer in bytes. 42 + * @flags: Zero or more GRND_* flags. 43 + * @opaque_state: Pointer to an opaque state area. 44 + * @opaque_len: Length of opaque state area. 45 + * 46 + * This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's 47 + * getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same 48 + * schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always 49 + * calls into the syscall. 
50 + * 51 + * If @buffer, @len, and @flags are 0, and @opaque_len is ~0UL, then @opaque_state is populated 52 + * with a struct vgetrandom_opaque_params and the function returns 0; if it does not return 0, 53 + * this function should not be used. 54 + * 55 + * @opaque_state *must* be allocated by calling mmap(2) using the mmap_prot and mmap_flags fields 56 + * from the struct vgetrandom_opaque_params, and states must not straddle pages. Unless external 57 + * locking is used, one state must be allocated per thread, as it is not safe to call this function 58 + * concurrently with the same @opaque_state. However, it is safe to call this using the same 59 + * @opaque_state that is shared between main code and signal handling code, within the same thread. 60 + * 61 + * Returns: The number of random bytes written to @buffer, or a negative value indicating an error. 62 + */ 63 + static __always_inline ssize_t 64 + __cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len, 65 + unsigned int flags, void *opaque_state, size_t opaque_len) 66 + { 67 + ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len); 68 + struct vgetrandom_state *state = opaque_state; 69 + size_t batch_len, nblocks, orig_len = len; 70 + bool in_use, have_retried = false; 71 + unsigned long current_generation; 72 + void *orig_buffer = buffer; 73 + u32 counter[2] = { 0 }; 74 + 75 + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) { 76 + *(struct vgetrandom_opaque_params *)opaque_state = (struct vgetrandom_opaque_params) { 77 + .size_of_opaque_state = sizeof(*state), 78 + .mmap_prot = PROT_READ | PROT_WRITE, 79 + .mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS 80 + }; 81 + return 0; 82 + } 83 + 84 + /* The state must not straddle a page, since pages can be zeroed at any time. 
*/ 85 + if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE)) 86 + return -EFAULT; 87 + 88 + /* If the caller passes the wrong size, which might happen due to CRIU, fallback. */ 89 + if (unlikely(opaque_len != sizeof(*state))) 90 + goto fallback_syscall; 91 + 92 + /* 93 + * If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from 94 + * userspace, because A) the various @flags require this to block, or not, depending on 95 + * various factors unavailable to userspace, and B) the kernel's behavior before the RNG is 96 + * ready is to reseed from the entropy pool at every invocation. 97 + */ 98 + if (unlikely(!READ_ONCE(rng_info->is_ready))) 99 + goto fallback_syscall; 100 + 101 + /* 102 + * This condition is checked after @rng_info->is_ready, because before the kernel's RNG is 103 + * initialized, the @flags parameter may require this to block or return an error, even when 104 + * len is zero. 105 + */ 106 + if (unlikely(!len)) 107 + return 0; 108 + 109 + /* 110 + * @state->in_use is basic reentrancy protection against this running in a signal handler 111 + * with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one 112 + * level of reentrancy. If a signal interrupts this after reading @state->in_use, but before 113 + * writing @state->in_use, there is still no race, because the signal handler will run to 114 + * its completion before returning execution. 115 + */ 116 + in_use = READ_ONCE(state->in_use); 117 + if (unlikely(in_use)) 118 + /* The syscall simply fills the buffer and does not touch @state, so fallback. */ 119 + goto fallback_syscall; 120 + WRITE_ONCE(state->in_use, true); 121 + 122 + retry_generation: 123 + /* 124 + * @rng_info->generation must always be read here, as it serializes @state->key with the 125 + * kernel's RNG reseeding schedule. 
126 + */ 127 + current_generation = READ_ONCE(rng_info->generation); 128 + 129 + /* 130 + * If @state->generation doesn't match the kernel RNG's generation, then it means the 131 + * kernel's RNG has reseeded, and so @state->key is reseeded as well. 132 + */ 133 + if (unlikely(state->generation != current_generation)) { 134 + /* 135 + * Write the generation before filling the key, in case of fork. If there is a fork 136 + * just after this line, the parent and child will get different random bytes from 137 + * the syscall, which is good. However, were this line to occur after the getrandom 138 + * syscall, then both child and parent could have the same bytes and the same 139 + * generation counter, so the fork would not be detected. Therefore, write 140 + * @state->generation before the call to the getrandom syscall. 141 + */ 142 + WRITE_ONCE(state->generation, current_generation); 143 + 144 + /* 145 + * Prevent the syscall from being reordered wrt current_generation. Pairs with the 146 + * smp_store_release(&_vdso_rng_data.generation) in random.c. 147 + */ 148 + smp_rmb(); 149 + 150 + /* Reseed @state->key using fresh bytes from the kernel. */ 151 + if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) { 152 + /* 153 + * If the syscall failed to refresh the key, then @state->key is now 154 + * invalid, so invalidate the generation so that it is not used again, and 155 + * fallback to using the syscall entirely. 156 + */ 157 + WRITE_ONCE(state->generation, 0); 158 + 159 + /* 160 + * Set @state->in_use to false only after the last write to @state in the 161 + * line above. 162 + */ 163 + WRITE_ONCE(state->in_use, false); 164 + 165 + goto fallback_syscall; 166 + } 167 + 168 + /* 169 + * Set @state->pos to beyond the end of the batch, so that the batch is refilled 170 + * using the new key. 171 + */ 172 + state->pos = sizeof(state->batch); 173 + } 174 + 175 + /* Set len to the total amount of bytes that this function is allowed to read, ret. 
*/ 176 + len = ret; 177 + more_batch: 178 + /* 179 + * First use bytes out of @state->batch, which may have been filled by the last call to this 180 + * function. 181 + */ 182 + batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len); 183 + if (batch_len) { 184 + /* Zeroing at the same time as memcpying helps preserve forward secrecy. */ 185 + memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len); 186 + state->pos += batch_len; 187 + buffer += batch_len; 188 + len -= batch_len; 189 + } 190 + 191 + if (!len) { 192 + /* Prevent the loop from being reordered wrt ->generation. */ 193 + barrier(); 194 + 195 + /* 196 + * Since @rng_info->generation will never be 0, re-read @state->generation, rather 197 + * than using the local current_generation variable, to learn whether a fork 198 + * occurred or if @state was zeroed due to memory pressure. Primarily, though, this 199 + * indicates whether the kernel's RNG has reseeded, in which case generate a new key 200 + * and start over. 201 + */ 202 + if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) { 203 + /* 204 + * Prevent this from looping forever in case of low memory or racing with a 205 + * user force-reseeding the kernel's RNG using the ioctl. 206 + */ 207 + if (have_retried) { 208 + WRITE_ONCE(state->in_use, false); 209 + goto fallback_syscall; 210 + } 211 + 212 + have_retried = true; 213 + buffer = orig_buffer; 214 + goto retry_generation; 215 + } 216 + 217 + /* 218 + * Set @state->in_use to false only when there will be no more reads or writes of 219 + * @state. 220 + */ 221 + WRITE_ONCE(state->in_use, false); 222 + return ret; 223 + } 224 + 225 + /* Generate blocks of RNG output directly into @buffer while there's enough room left. 
*/ 226 + nblocks = len / CHACHA_BLOCK_SIZE; 227 + if (nblocks) { 228 + __arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks); 229 + buffer += nblocks * CHACHA_BLOCK_SIZE; 230 + len -= nblocks * CHACHA_BLOCK_SIZE; 231 + } 232 + 233 + BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0); 234 + 235 + /* Refill the batch and overwrite the key, in order to preserve forward secrecy. */ 236 + __arch_chacha20_blocks_nostack(state->batch_key, state->key, counter, 237 + sizeof(state->batch_key) / CHACHA_BLOCK_SIZE); 238 + 239 + /* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */ 240 + state->pos = 0; 241 + goto more_batch; 242 + 243 + fallback_syscall: 244 + return getrandom_syscall(orig_buffer, orig_len, flags); 245 + } 246 + 247 + static __always_inline ssize_t 248 + __cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) 249 + { 250 + return __cvdso_getrandom_data(__arch_get_vdso_rng_data(), buffer, len, flags, opaque_state, opaque_len); 251 + }
+1 -1
mm/ksm.c
··· 713 713 { 714 714 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | 715 715 VM_IO | VM_DONTEXPAND | VM_HUGETLB | 716 - VM_MIXEDMAP)) 716 + VM_MIXEDMAP | VM_DROPPABLE)) 717 717 return false; /* just ignore the advice */ 718 718 719 719 if (vma_is_dax(vma))
+4 -1
mm/madvise.c
··· 1068 1068 new_flags |= VM_WIPEONFORK; 1069 1069 break; 1070 1070 case MADV_KEEPONFORK: 1071 + if (vma->vm_flags & VM_DROPPABLE) 1072 + return -EINVAL; 1071 1073 new_flags &= ~VM_WIPEONFORK; 1072 1074 break; 1073 1075 case MADV_DONTDUMP: 1074 1076 new_flags |= VM_DONTDUMP; 1075 1077 break; 1076 1078 case MADV_DODUMP: 1077 - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) 1079 + if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || 1080 + (vma->vm_flags & VM_DROPPABLE)) 1078 1081 return -EINVAL; 1079 1082 new_flags &= ~VM_DONTDUMP; 1080 1083 break;
+13
mm/memory.c
··· 5801 5801 /* If the fault handler drops the mmap_lock, vma may be freed */ 5802 5802 struct mm_struct *mm = vma->vm_mm; 5803 5803 vm_fault_t ret; 5804 + bool is_droppable; 5804 5805 5805 5806 __set_current_state(TASK_RUNNING); 5806 5807 ··· 5815 5814 ret = VM_FAULT_SIGSEGV; 5816 5815 goto out; 5817 5816 } 5817 + 5818 + is_droppable = !!(vma->vm_flags & VM_DROPPABLE); 5818 5819 5819 5820 /* 5820 5821 * Enable the memcg OOM handling for faults triggered in user ··· 5832 5829 else 5833 5830 ret = __handle_mm_fault(vma, address, flags); 5834 5831 5832 + /* 5833 + * Warning: It is no longer safe to dereference vma-> after this point, 5834 + * because mmap_lock might have been dropped by __handle_mm_fault(), so 5835 + * vma might be destroyed from underneath us. 5836 + */ 5837 + 5835 5838 lru_gen_exit_fault(); 5839 + 5840 + /* If the mapping is droppable, then errors due to OOM aren't fatal. */ 5841 + if (is_droppable) 5842 + ret &= ~VM_FAULT_OOM; 5836 5843 5837 5844 if (flags & FAULT_FLAG_USER) { 5838 5845 mem_cgroup_exit_user_fault();
+3
mm/mempolicy.c
··· 2305 2305 pgoff_t ilx; 2306 2306 struct folio *folio; 2307 2307 2308 + if (vma->vm_flags & VM_DROPPABLE) 2309 + gfp |= __GFP_NOWARN; 2310 + 2308 2311 pol = get_vma_policy(vma, addr, order, &ilx); 2309 2312 folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); 2310 2313 mpol_cond_put(pol);
+1 -1
mm/mlock.c
··· 474 474 475 475 if (newflags == oldflags || (oldflags & VM_SPECIAL) || 476 476 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || 477 - vma_is_dax(vma) || vma_is_secretmem(vma)) 477 + vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) 478 478 /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ 479 479 goto out; 480 480
+30
mm/mmap.c
··· 1410 1410 pgoff = 0; 1411 1411 vm_flags |= VM_SHARED | VM_MAYSHARE; 1412 1412 break; 1413 + case MAP_DROPPABLE: 1414 + if (VM_DROPPABLE == VM_NONE) 1415 + return -ENOTSUPP; 1416 + /* 1417 + * A locked or stack area makes no sense to be droppable. 1418 + * 1419 + * Also, since droppable pages can just go away at any time 1420 + * it makes no sense to copy them on fork or dump them. 1421 + * 1422 + * And don't attempt to combine with hugetlb for now. 1423 + */ 1424 + if (flags & (MAP_LOCKED | MAP_HUGETLB)) 1425 + return -EINVAL; 1426 + if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) 1427 + return -EINVAL; 1428 + 1429 + vm_flags |= VM_DROPPABLE; 1430 + 1431 + /* 1432 + * If the pages can be dropped, then it doesn't make 1433 + * sense to reserve them. 1434 + */ 1435 + vm_flags |= VM_NORESERVE; 1436 + 1437 + /* 1438 + * Likewise, they're volatile enough that they 1439 + * shouldn't survive forks or coredumps. 1440 + */ 1441 + vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; 1442 + fallthrough; 1413 1443 case MAP_PRIVATE: 1414 1444 /* 1415 1445 * Set pgoff according to addr for anon_vma.
+18 -3
mm/rmap.c
··· 1412 1412 VM_BUG_ON_VMA(address < vma->vm_start || 1413 1413 address + (nr << PAGE_SHIFT) > vma->vm_end, vma); 1414 1414 1415 - if (!folio_test_swapbacked(folio)) 1415 + /* 1416 + * VM_DROPPABLE mappings don't swap; instead they're just dropped when 1417 + * under memory pressure. 1418 + */ 1419 + if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE)) 1416 1420 __folio_set_swapbacked(folio); 1417 1421 __folio_set_anon(folio, vma, address, exclusive); 1418 1422 ··· 1852 1848 * plus the rmap(s) (dropped by discard:). 1853 1849 */ 1854 1850 if (ref_count == 1 + map_count && 1855 - !folio_test_dirty(folio)) { 1851 + (!folio_test_dirty(folio) || 1852 + /* 1853 + * Unlike MADV_FREE mappings, VM_DROPPABLE 1854 + * ones can be dropped even if they've 1855 + * been dirtied. 1856 + */ 1857 + (vma->vm_flags & VM_DROPPABLE))) { 1856 1858 dec_mm_counter(mm, MM_ANONPAGES); 1857 1859 goto discard; 1858 1860 } ··· 1868 1858 * discarded. Remap the page to page table. 1869 1859 */ 1870 1860 set_pte_at(mm, address, pvmw.pte, pteval); 1871 - folio_set_swapbacked(folio); 1861 + /* 1862 + * Unlike MADV_FREE mappings, VM_DROPPABLE ones 1863 + * never get swap backed on failure to drop. 1864 + */ 1865 + if (!(vma->vm_flags & VM_DROPPABLE)) 1866 + folio_set_swapbacked(folio); 1872 1867 goto walk_abort; 1873 1868 } 1874 1869
-9
mm/vmscan.c
··· 4301 4301 return true; 4302 4302 } 4303 4303 4304 - /* dirty lazyfree */ 4305 - if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { 4306 - success = lru_gen_del_folio(lruvec, folio, true); 4307 - VM_WARN_ON_ONCE_FOLIO(!success, folio); 4308 - folio_set_swapbacked(folio); 4309 - lruvec_add_folio_tail(lruvec, folio); 4310 - return true; 4311 - } 4312 - 4313 4304 /* promoted */ 4314 4305 if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { 4315 4306 list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
tools/include/asm/rwonce.h
+1
tools/include/uapi/linux/mman.h
··· 17 17 #define MAP_SHARED 0x01 /* Share changes */ 18 18 #define MAP_PRIVATE 0x02 /* Changes are private */ 19 19 #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ 20 + #define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */ 20 21 21 22 /* 22 23 * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
+1
tools/testing/selftests/mm/.gitignore
··· 50 50 hugetlb_madv_vs_map 51 51 mseal_test 52 52 seal_elf 53 + droppable
+1
tools/testing/selftests/mm/Makefile
··· 76 76 TEST_GEN_FILES += hugetlb_fault_after_madv 77 77 TEST_GEN_FILES += hugetlb_madv_vs_map 78 78 TEST_GEN_FILES += hugetlb_dio 79 + TEST_GEN_FILES += droppable 79 80 80 81 ifneq ($(ARCH),arm64) 81 82 TEST_GEN_FILES += soft-dirty
+53
tools/testing/selftests/mm/droppable.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 + */ 5 + 6 + #include <assert.h> 7 + #include <stdbool.h> 8 + #include <stdint.h> 9 + #include <stdio.h> 10 + #include <stdlib.h> 11 + #include <unistd.h> 12 + #include <signal.h> 13 + #include <sys/mman.h> 14 + #include <linux/mman.h> 15 + 16 + #include "../kselftest.h" 17 + 18 + int main(int argc, char *argv[]) 19 + { 20 + size_t alloc_size = 134217728; 21 + size_t page_size = getpagesize(); 22 + void *alloc; 23 + pid_t child; 24 + 25 + ksft_print_header(); 26 + ksft_set_plan(1); 27 + 28 + alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); 29 + assert(alloc != MAP_FAILED); 30 + memset(alloc, 'A', alloc_size); 31 + for (size_t i = 0; i < alloc_size; i += page_size) 32 + assert(*(uint8_t *)(alloc + i)); 33 + 34 + child = fork(); 35 + assert(child >= 0); 36 + if (!child) { 37 + for (;;) 38 + *(char *)malloc(page_size) = 'B'; 39 + } 40 + 41 + for (bool done = false; !done;) { 42 + for (size_t i = 0; i < alloc_size; i += page_size) { 43 + if (!*(uint8_t *)(alloc + i)) { 44 + done = true; 45 + break; 46 + } 47 + } 48 + } 49 + kill(child, SIGTERM); 50 + 51 + ksft_test_result_pass("MAP_DROPPABLE: PASS\n"); 52 + exit(KSFT_PASS); 53 + }
+2
tools/testing/selftests/vDSO/.gitignore
··· 6 6 vdso_test_gettimeofday 7 7 vdso_test_getcpu 8 8 vdso_standalone_test_x86 9 + vdso_test_getrandom 10 + vdso_test_chacha
+18
tools/testing/selftests/vDSO/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 uname_M := $(shell uname -m 2>/dev/null || echo not) 3 3 ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) 4 + SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null) 4 5 5 6 TEST_GEN_PROGS := vdso_test_gettimeofday 6 7 TEST_GEN_PROGS += vdso_test_getcpu ··· 11 10 TEST_GEN_PROGS += vdso_standalone_test_x86 12 11 endif 13 12 TEST_GEN_PROGS += vdso_test_correctness 13 + ifeq ($(uname_M),x86_64) 14 + TEST_GEN_PROGS += vdso_test_getrandom 15 + ifneq ($(SODIUM),) 16 + TEST_GEN_PROGS += vdso_test_chacha 17 + endif 18 + endif 14 19 15 20 CFLAGS := -std=gnu99 16 21 ··· 35 28 36 29 $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c 37 30 $(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl 31 + 32 + $(OUTPUT)/vdso_test_getrandom: parse_vdso.c 33 + $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ 34 + -isystem $(top_srcdir)/include/uapi 35 + 36 + $(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S 37 + $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ 38 + -isystem $(top_srcdir)/arch/$(ARCH)/include \ 39 + -isystem $(top_srcdir)/include \ 40 + -D__ASSEMBLY__ -DBUILD_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \ 41 + -Wa,--noexecstack $(SODIUM)
+43
tools/testing/selftests/vDSO/vdso_test_chacha.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 + */ 5 + 6 + #include <sodium/crypto_stream_chacha20.h> 7 + #include <sys/random.h> 8 + #include <string.h> 9 + #include <stdint.h> 10 + #include "../kselftest.h" 11 + 12 + extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks); 13 + 14 + int main(int argc, char *argv[]) 15 + { 16 + enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 }; 17 + static const uint8_t nonce[8] = { 0 }; 18 + uint32_t counter[2]; 19 + uint8_t key[32]; 20 + uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS]; 21 + 22 + ksft_print_header(); 23 + ksft_set_plan(1); 24 + 25 + for (unsigned int trial = 0; trial < TRIALS; ++trial) { 26 + if (getrandom(key, sizeof(key), 0) != sizeof(key)) { 27 + printf("getrandom() failed!\n"); 28 + return KSFT_SKIP; 29 + } 30 + crypto_stream_chacha20(output1, sizeof(output1), nonce, key); 31 + for (unsigned int split = 0; split < BLOCKS; ++split) { 32 + memset(output2, 'X', sizeof(output2)); 33 + memset(counter, 0, sizeof(counter)); 34 + if (split) 35 + __arch_chacha20_blocks_nostack(output2, key, counter, split); 36 + __arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split); 37 + if (memcmp(output1, output2, sizeof(output1))) 38 + return KSFT_FAIL; 39 + } 40 + } 41 + ksft_test_result_pass("chacha: PASS\n"); 42 + return KSFT_PASS; 43 + }
+288
tools/testing/selftests/vDSO/vdso_test_getrandom.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 + */ 5 + 6 + #include <assert.h> 7 + #include <pthread.h> 8 + #include <stdint.h> 9 + #include <stdio.h> 10 + #include <stdlib.h> 11 + #include <string.h> 12 + #include <time.h> 13 + #include <unistd.h> 14 + #include <signal.h> 15 + #include <sys/auxv.h> 16 + #include <sys/mman.h> 17 + #include <sys/random.h> 18 + #include <sys/syscall.h> 19 + #include <sys/types.h> 20 + #include <linux/random.h> 21 + 22 + #include "../kselftest.h" 23 + #include "parse_vdso.h" 24 + 25 + #ifndef timespecsub 26 + #define timespecsub(tsp, usp, vsp) \ 27 + do { \ 28 + (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \ 29 + (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \ 30 + if ((vsp)->tv_nsec < 0) { \ 31 + (vsp)->tv_sec--; \ 32 + (vsp)->tv_nsec += 1000000000L; \ 33 + } \ 34 + } while (0) 35 + #endif 36 + 37 + static struct { 38 + pthread_mutex_t lock; 39 + void **states; 40 + size_t len, cap; 41 + } grnd_allocator = { 42 + .lock = PTHREAD_MUTEX_INITIALIZER 43 + }; 44 + 45 + static struct { 46 + ssize_t(*fn)(void *, size_t, unsigned long, void *, size_t); 47 + pthread_key_t key; 48 + pthread_once_t initialized; 49 + struct vgetrandom_opaque_params params; 50 + } grnd_ctx = { 51 + .initialized = PTHREAD_ONCE_INIT 52 + }; 53 + 54 + static void *vgetrandom_get_state(void) 55 + { 56 + void *state = NULL; 57 + 58 + pthread_mutex_lock(&grnd_allocator.lock); 59 + if (!grnd_allocator.len) { 60 + size_t page_size = getpagesize(); 61 + size_t new_cap; 62 + size_t alloc_size, num = sysconf(_SC_NPROCESSORS_ONLN); /* Just a decent heuristic. 
*/ 63 + void *new_block, *new_states; 64 + 65 + alloc_size = (num * grnd_ctx.params.size_of_opaque_state + page_size - 1) & (~(page_size - 1)); 66 + num = (page_size / grnd_ctx.params.size_of_opaque_state) * (alloc_size / page_size); 67 + new_block = mmap(0, alloc_size, grnd_ctx.params.mmap_prot, grnd_ctx.params.mmap_flags, -1, 0); 68 + if (new_block == MAP_FAILED) 69 + goto out; 70 + 71 + new_cap = grnd_allocator.cap + num; 72 + new_states = reallocarray(grnd_allocator.states, new_cap, sizeof(*grnd_allocator.states)); 73 + if (!new_states) 74 + goto unmap; 75 + grnd_allocator.cap = new_cap; 76 + grnd_allocator.states = new_states; 77 + 78 + for (size_t i = 0; i < num; ++i) { 79 + if (((uintptr_t)new_block & (page_size - 1)) + grnd_ctx.params.size_of_opaque_state > page_size) 80 + new_block = (void *)(((uintptr_t)new_block + page_size - 1) & (~(page_size - 1))); 81 + grnd_allocator.states[i] = new_block; 82 + new_block += grnd_ctx.params.size_of_opaque_state; 83 + } 84 + grnd_allocator.len = num; 85 + goto success; 86 + 87 + unmap: 88 + munmap(new_block, alloc_size); 89 + goto out; 90 + } 91 + success: 92 + state = grnd_allocator.states[--grnd_allocator.len]; 93 + 94 + out: 95 + pthread_mutex_unlock(&grnd_allocator.lock); 96 + return state; 97 + } 98 + 99 + static void vgetrandom_put_state(void *state) 100 + { 101 + if (!state) 102 + return; 103 + pthread_mutex_lock(&grnd_allocator.lock); 104 + grnd_allocator.states[grnd_allocator.len++] = state; 105 + pthread_mutex_unlock(&grnd_allocator.lock); 106 + } 107 + 108 + static void vgetrandom_init(void) 109 + { 110 + if (pthread_key_create(&grnd_ctx.key, vgetrandom_put_state) != 0) 111 + return; 112 + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); 113 + if (!sysinfo_ehdr) { 114 + printf("AT_SYSINFO_EHDR is not present!\n"); 115 + exit(KSFT_SKIP); 116 + } 117 + vdso_init_from_sysinfo_ehdr(sysinfo_ehdr); 118 + grnd_ctx.fn = (__typeof__(grnd_ctx.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom"); 119 + if 
(!grnd_ctx.fn) { 120 + printf("__vdso_getrandom is missing!\n"); 121 + exit(KSFT_FAIL); 122 + } 123 + if (grnd_ctx.fn(NULL, 0, 0, &grnd_ctx.params, ~0UL) != 0) { 124 + printf("failed to fetch vgetrandom params!\n"); 125 + exit(KSFT_FAIL); 126 + } 127 + } 128 + 129 + static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags) 130 + { 131 + void *state; 132 + 133 + pthread_once(&grnd_ctx.initialized, vgetrandom_init); 134 + state = pthread_getspecific(grnd_ctx.key); 135 + if (!state) { 136 + state = vgetrandom_get_state(); 137 + if (pthread_setspecific(grnd_ctx.key, state) != 0) { 138 + vgetrandom_put_state(state); 139 + state = NULL; 140 + } 141 + if (!state) { 142 + printf("vgetrandom_get_state failed!\n"); 143 + exit(KSFT_FAIL); 144 + } 145 + } 146 + return grnd_ctx.fn(buf, len, flags, state, grnd_ctx.params.size_of_opaque_state); 147 + } 148 + 149 + enum { TRIALS = 25000000, THREADS = 256 }; 150 + 151 + static void *test_vdso_getrandom(void *) 152 + { 153 + for (size_t i = 0; i < TRIALS; ++i) { 154 + unsigned int val; 155 + ssize_t ret = vgetrandom(&val, sizeof(val), 0); 156 + assert(ret == sizeof(val)); 157 + } 158 + return NULL; 159 + } 160 + 161 + static void *test_libc_getrandom(void *) 162 + { 163 + for (size_t i = 0; i < TRIALS; ++i) { 164 + unsigned int val; 165 + ssize_t ret = getrandom(&val, sizeof(val), 0); 166 + assert(ret == sizeof(val)); 167 + } 168 + return NULL; 169 + } 170 + 171 + static void *test_syscall_getrandom(void *) 172 + { 173 + for (size_t i = 0; i < TRIALS; ++i) { 174 + unsigned int val; 175 + ssize_t ret = syscall(__NR_getrandom, &val, sizeof(val), 0); 176 + assert(ret == sizeof(val)); 177 + } 178 + return NULL; 179 + } 180 + 181 + static void bench_single(void) 182 + { 183 + struct timespec start, end, diff; 184 + 185 + clock_gettime(CLOCK_MONOTONIC, &start); 186 + test_vdso_getrandom(NULL); 187 + clock_gettime(CLOCK_MONOTONIC, &end); 188 + timespecsub(&end, &start, &diff); 189 + printf(" vdso: %u times in %lu.%09lu 
seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec); 190 + 191 + clock_gettime(CLOCK_MONOTONIC, &start); 192 + test_libc_getrandom(NULL); 193 + clock_gettime(CLOCK_MONOTONIC, &end); 194 + timespecsub(&end, &start, &diff); 195 + printf(" libc: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec); 196 + 197 + clock_gettime(CLOCK_MONOTONIC, &start); 198 + test_syscall_getrandom(NULL); 199 + clock_gettime(CLOCK_MONOTONIC, &end); 200 + timespecsub(&end, &start, &diff); 201 + printf("syscall: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec); 202 + } 203 + 204 + static void bench_multi(void) 205 + { 206 + struct timespec start, end, diff; 207 + pthread_t threads[THREADS]; 208 + 209 + clock_gettime(CLOCK_MONOTONIC, &start); 210 + for (size_t i = 0; i < THREADS; ++i) 211 + assert(pthread_create(&threads[i], NULL, test_vdso_getrandom, NULL) == 0); 212 + for (size_t i = 0; i < THREADS; ++i) 213 + pthread_join(threads[i], NULL); 214 + clock_gettime(CLOCK_MONOTONIC, &end); 215 + timespecsub(&end, &start, &diff); 216 + printf(" vdso: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec); 217 + 218 + clock_gettime(CLOCK_MONOTONIC, &start); 219 + for (size_t i = 0; i < THREADS; ++i) 220 + assert(pthread_create(&threads[i], NULL, test_libc_getrandom, NULL) == 0); 221 + for (size_t i = 0; i < THREADS; ++i) 222 + pthread_join(threads[i], NULL); 223 + clock_gettime(CLOCK_MONOTONIC, &end); 224 + timespecsub(&end, &start, &diff); 225 + printf(" libc: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec); 226 + 227 + clock_gettime(CLOCK_MONOTONIC, &start); 228 + for (size_t i = 0; i < THREADS; ++i) 229 + assert(pthread_create(&threads[i], NULL, test_syscall_getrandom, NULL) == 0); 230 + for (size_t i = 0; i < THREADS; ++i) 231 + pthread_join(threads[i], NULL); 232 + clock_gettime(CLOCK_MONOTONIC, &end); 233 + timespecsub(&end, &start, &diff); 234 + printf(" syscall: %u x %u times in %lu.%09lu 
seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec); 235 + } 236 + 237 + static void fill(void) 238 + { 239 + uint8_t weird_size[323929]; 240 + for (;;) 241 + vgetrandom(weird_size, sizeof(weird_size), 0); 242 + } 243 + 244 + static void kselftest(void) 245 + { 246 + uint8_t weird_size[1263]; 247 + 248 + ksft_print_header(); 249 + ksft_set_plan(1); 250 + 251 + for (size_t i = 0; i < 1000; ++i) { 252 + ssize_t ret = vgetrandom(weird_size, sizeof(weird_size), 0); 253 + if (ret != sizeof(weird_size)) 254 + exit(KSFT_FAIL); 255 + } 256 + 257 + ksft_test_result_pass("getrandom: PASS\n"); 258 + exit(KSFT_PASS); 259 + } 260 + 261 + static void usage(const char *argv0) 262 + { 263 + fprintf(stderr, "Usage: %s [bench-single|bench-multi|fill]\n", argv0); 264 + } 265 + 266 + int main(int argc, char *argv[]) 267 + { 268 + if (argc == 1) { 269 + kselftest(); 270 + return 0; 271 + } 272 + 273 + if (argc != 2) { 274 + usage(argv[0]); 275 + return 1; 276 + } 277 + if (!strcmp(argv[1], "bench-single")) 278 + bench_single(); 279 + else if (!strcmp(argv[1], "bench-multi")) 280 + bench_multi(); 281 + else if (!strcmp(argv[1], "fill")) 282 + fill(); 283 + else { 284 + usage(argv[0]); 285 + return 1; 286 + } 287 + return 0; 288 + }